In [1]:
from fastai.imports import *
import warnings, logging

warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

In [2]:
from datasets import Dataset, DatasetDict

# Explorative Data Analysis

In [3]:
import os
from pathlib import Path
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

if iskaggle:
    path = Path('../input/us-patent-phrase-to-phrase-matching')
    ! pip install -q datasets
path = Path('us-patent-phrase-to-phrase-matching')

if iskaggle:
    path = Path('../input/us-patent-phrase-to-phrase-matching')
    
print(path)  
!ls {path}

[0m../input/us-patent-phrase-to-phrase-matching
sample_submission.csv  test.csv  train.csv


In [4]:
df = pd.read_csv(path / 'train.csv')
df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [5]:
df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,37d61fd2272659b1,component composite coating,composition,H01
freq,1,152,24,2186


In [6]:
df.isnull().sum().sum()

0

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36473 entries, 0 to 36472
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       36473 non-null  object 
 1   anchor   36473 non-null  object 
 2   target   36473 non-null  object 
 3   context  36473 non-null  object 
 4   score    36473 non-null  float64
dtypes: float64(1), object(4)
memory usage: 1.4+ MB


In [8]:
def value_counts(data, columns):
    for c in columns:
        print(f'\n --- {c} --- \n')
        v = data[c].value_counts()
        print(f'{v}')
        print(f'{c}.max = {v.max()}, {c}.mean = {v.mean()}, {c}.min = {v.min()}')
value_counts(df, ['context', 'anchor', 'target', 'score'])



 --- context --- 

H01    2186
H04    2177
G01    1812
A61    1477
F16    1091
       ... 
B03      47
F17      33
B31      24
A62      23
F26      18
Name: context, Length: 106, dtype: int64
context.max = 2186, context.mean = 344.08490566037733, context.min = 18

 --- anchor --- 

component composite coating              152
sheet supply roller                      150
source voltage                           140
perfluoroalkyl group                     136
el display                               135
                                        ... 
plug nozzle                                2
shannon                                    2
dry coating composition1                   2
peripheral nervous system stimulation      1
conduct conducting material                1
Name: anchor, Length: 733, dtype: int64
anchor.max = 152, anchor.mean = 49.758526603001364, anchor.min = 1

 --- target --- 

composition                    24
data                           22
metal                      

In [9]:
df.target.str.count(' ').max(), df.anchor.str.count(' ').max()

(14, 4)

In [10]:
df.anchor.value_counts().mean(), df.anchor.value_counts().max()

(49.758526603001364, 152)

In [11]:
df['section'] = df.context.str[0]

In [12]:
value_counts(df, ['section', 'context', 'anchor', 'score'])


 --- section --- 

B    8019
H    6195
G    6013
C    5288
A    4094
F    4054
E    1531
D    1279
Name: section, dtype: int64
section.max = 8019, section.mean = 4559.125, section.min = 1279

 --- context --- 

H01    2186
H04    2177
G01    1812
A61    1477
F16    1091
       ... 
B03      47
F17      33
B31      24
A62      23
F26      18
Name: context, Length: 106, dtype: int64
context.max = 2186, context.mean = 344.08490566037733, context.min = 18

 --- anchor --- 

component composite coating              152
sheet supply roller                      150
source voltage                           140
perfluoroalkyl group                     136
el display                               135
                                        ... 
plug nozzle                                2
shannon                                    2
dry coating composition1                   2
peripheral nervous system stimulation      1
conduct conducting material                1
Name: anchor, Length: 733, dt

## 数据量与分布小结
- 原始数据 36473 rows × 5 columns，没有空值，没有异常值。
- 其中包含有 2 种分组类型：context, anchor, 和一个数值结果： score
- context 分组数据行数 max = 2186， mean = 344, min = 18     部分数据不足
- anchor 分组数据行数 max = 152, mean = 49, min = 1，        部分数据不足
- score  分组数据行数 max = 12300, mean = 7294, min = 1154， 数据充足

增强 context 分级特征
- section 分组数据行数 max = 8019, mean = 4559, min = 1279, 数据充足

## 字符型数据状况

In [13]:
def has_uppercase(data, columns):
    for c in columns:
        up = any([ch.isupper() for ch in data[c]])
        print(f'{c} {"has" if up else "has no"} upper case')

In [14]:
has_uppercase(df, ['context', 'anchor', 'target'])

context has upper case
anchor has no upper case
target has no upper case


只有 context 有 upper case  情况，整体数据都比较干净

# 设计验收数据集

In [15]:
def split_train_valid(data, column):
    u_data = list(data[column].unique())
    np.random.seed(42)
    np.random.shuffle(u_data)
    
    val_prop = 0.25
    val_size = int(len(u_data) * val_prop)
    val_data = u_data[:val_size]
    is_val = data.anchor.isin(val_data)
    
    idxs = np.arange(len(is_val))
    trn_idxs = idxs[~is_val]
    val_idxs = idxs[is_val]
    return trn_idxs, val_idxs

In [16]:
trn_idxs, val_idxs = split_train_valid(df, 'anchor')
len(trn_idxs), len(val_idxs), trn_idxs, val_idxs

(27357,
 9116,
 array([    0,     1,     2, ..., 36470, 36471, 36472]),
 array([  131,   132,   133, ..., 36414, 36415, 36416]))

In [17]:
from datasets import DatasetDict
def get_dataset_dict(dataset, trn_idxs, val_idxs):
    return DatasetDict({
      "train" : dataset.select(trn_idxs),
      "test" : dataset.select(val_idxs)
    })

## 方案一 数据预处理

In [18]:
from transformers import AutoTokenizer
model_nm = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(model_nm)
tokenizer.all_special_tokens

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

['[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]']

In [19]:
def tokenize(dataset, projection):
    return dataset.map(lambda x: tokenizer(x[projection]))

In [20]:
def mk_inputs(data_frame, sep, columns):
    return data_frame[columns].apply(lambda x: sep.join(x), axis = 1)

In [21]:
input_columns =  ['context', 'target', 'anchor']
sep = f' {tokenizer.sep_token} '
df['inputs'] = mk_inputs(df, sep, input_columns)
df.head()

Unnamed: 0,id,anchor,target,context,score,section,inputs
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,A47 [SEP] abatement of pollution [SEP] abatement
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,A47 [SEP] act of abating [SEP] abatement
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,A47 [SEP] active catalyst [SEP] abatement
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,A47 [SEP] eliminating process [SEP] abatement
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,A47 [SEP] forest region [SEP] abatement


In [22]:
from datasets import Dataset
dataset = Dataset.from_pandas(df).map(lambda row: tokenizer(row['inputs']), batched=True, remove_columns= input_columns + ['id', 'section', 'inputs'])
dataset = dataset.rename_column('score', 'label')
dataset[0]

  0%|          | 0/37 [00:00<?, ?ba/s]

{'label': 0.5,
 'input_ids': [1, 336, 5753, 2, 47284, 265, 6435, 2, 47284, 2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [23]:
dds = get_dataset_dict(dataset,trn_idxs, val_idxs)
dds

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27357
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9116
    })
})

## 准备 transformer arguments, model and trainer

In [24]:
def corr(eval_preds): return { "Pearson": np.corrcoef(*eval_preds)[0][1] }

In [25]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Downloading pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

In [26]:
bs = 128
lr = 8e-5
epochs = 4
args = TrainingArguments(output_dir='outputs', evaluation_strategy='epoch', 
                         weight_decay=0.01, warmup_ratio=0.1,
                         per_device_train_batch_size= bs, per_device_eval_batch_size= bs * 2,
                         num_train_epochs=epochs, lr_scheduler_type='cosine', fp16=True,
                         report_to='none')

In [27]:
trainer = Trainer(model = model, args=args, tokenizer = tokenizer, train_dataset= dds['train'], eval_dataset=dds['test'], compute_metrics=corr)

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.028688,0.789305
2,No log,0.024615,0.80972
3,0.040000,0.024494,0.812765
4,0.040000,0.025285,0.812865


TrainOutput(global_step=856, training_loss=0.030126766623737655, metrics={'train_runtime': 238.4801, 'train_samples_per_second': 458.856, 'train_steps_per_second': 3.589, 'total_flos': 469191313065600.0, 'train_loss': 0.030126766623737655, 'epoch': 4.0})

## 方案二 数据预处理 —— context => [context]

In [29]:
df2 = pd.read_csv(path / 'train.csv')
df2['section'] = df2.context.str[0]
df2['context_2'] = '[' + df2.context + ']'

In [30]:
input_columns =  ['context_2', 'target', 'anchor']
sep = f' {tokenizer.sep_token} '
df2['inputs'] = mk_inputs(df2, sep, input_columns)
df2.head()

Unnamed: 0,id,anchor,target,context,score,section,context_2,inputs
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,[A47],[A47] [SEP] abatement of pollution [SEP] abatement
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,[A47],[A47] [SEP] act of abating [SEP] abatement
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,[A47],[A47] [SEP] active catalyst [SEP] abatement
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,[A47],[A47] [SEP] eliminating process [SEP] abatement
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,[A47],[A47] [SEP] forest region [SEP] abatement


In [31]:
dataset2 = Dataset.from_pandas(df2).map(lambda row: tokenizer(row['inputs']), batched=True, remove_columns= input_columns + ['id', 'section', 'inputs', 'context'])
dataset2 = dataset2.rename_column('score', 'label')
dataset2

  0%|          | 0/37 [00:00<?, ?ba/s]

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36473
})

In [32]:
dds2 = get_dataset_dict(dataset2,trn_idxs, val_idxs)
dds2

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27357
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9116
    })
})

In [33]:
trainer2 = Trainer(model = model, args=args, tokenizer = tokenizer, train_dataset= dds2['train'], eval_dataset=dds2['test'], compute_metrics=corr)

In [34]:
trainer2.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.032373,0.804015
2,No log,0.024202,0.811386
3,0.015000,0.024958,0.812921
4,0.015000,0.025048,0.813124


TrainOutput(global_step=856, training_loss=0.012946988377615671, metrics={'train_runtime': 237.9435, 'train_samples_per_second': 459.891, 'train_steps_per_second': 3.597, 'total_flos': 525815923757040.0, 'train_loss': 0.012946988377615671, 'epoch': 4.0})

## 方案三 数据预处理 —— 自定义分隔符 [SEP] => [S]

In [35]:
df3 = pd.read_csv(path / 'train.csv')
df3['section'] = df3.context.str[0]
df3['context_2'] = '[' + df3.context + ']'

input_columns =  ['context_2', 'target', 'anchor']
sep = f' [S] '
df3['inputs'] = mk_inputs(df3, sep, input_columns)
df3.head()

Unnamed: 0,id,anchor,target,context,score,section,context_2,inputs
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,[A47],[A47] [S] abatement of pollution [S] abatement
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,[A47],[A47] [S] act of abating [S] abatement
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,[A47],[A47] [S] active catalyst [S] abatement
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,[A47],[A47] [S] eliminating process [S] abatement
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,[A47],[A47] [S] forest region [S] abatement


In [36]:
dataset3 = Dataset.from_pandas(df3).map(lambda row: tokenizer(row['inputs']), batched=True, remove_columns= input_columns + ['id', 'section', 'inputs', 'context'])
dataset3 = dataset3.rename_column('score', 'label')
dataset3

  0%|          | 0/37 [00:00<?, ?ba/s]

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36473
})

In [37]:
dds3 = get_dataset_dict(dataset3,trn_idxs, val_idxs)
dds3

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27357
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9116
    })
})

In [38]:
trainer3 = Trainer(model = model, args=args, tokenizer = tokenizer, train_dataset= dds3['train'], eval_dataset=dds3['test'], compute_metrics=corr)

In [39]:
trainer3.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.027783,0.805532
2,No log,0.025217,0.807099
3,0.010900,0.024913,0.809814
4,0.010900,0.025305,0.809015


TrainOutput(global_step=856, training_loss=0.009570876014566868, metrics={'train_runtime': 252.067, 'train_samples_per_second': 434.123, 'train_steps_per_second': 3.396, 'total_flos': 639065145139920.0, 'train_loss': 0.009570876014566868, 'epoch': 4.0})

## 方案四 数据预处理 —— inputs 小写

In [40]:
df4 = pd.read_csv(path / 'train.csv')
df4['section'] = df4.context.str[0]
df4['context_2'] = '[' + df4.context + ']'

input_columns =  ['context_2', 'target', 'anchor']
sep = f' [S] '
df4['inputs'] = mk_inputs(df4, sep, input_columns).str.lower()
df4.head()

Unnamed: 0,id,anchor,target,context,score,section,context_2,inputs
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,[A47],[a47] [s] abatement of pollution [s] abatement
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,[A47],[a47] [s] act of abating [s] abatement
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,[A47],[a47] [s] active catalyst [s] abatement
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,[A47],[a47] [s] eliminating process [s] abatement
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,[A47],[a47] [s] forest region [s] abatement


In [41]:
dataset4 = Dataset.from_pandas(df4).map(lambda row: tokenizer(row['inputs']), batched=True, remove_columns= input_columns + ['id', 'section', 'inputs', 'context'])
dataset4 = dataset4.rename_column('score', 'label')
dds4 = get_dataset_dict(dataset4,trn_idxs, val_idxs)
trainer4 = Trainer(model = model, args=args, tokenizer = tokenizer, train_dataset= dds4['train'], eval_dataset=dds4['test'], compute_metrics=corr)

  0%|          | 0/37 [00:00<?, ?ba/s]

In [42]:
trainer4.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.028046,0.800561
2,No log,0.025369,0.803122
3,0.005900,0.025404,0.806986
4,0.005900,0.025337,0.807384


TrainOutput(global_step=856, training_loss=0.005717032005853742, metrics={'train_runtime': 252.4846, 'train_samples_per_second': 433.405, 'train_steps_per_second': 3.39, 'total_flos': 639065145139920.0, 'train_loss': 0.005717032005853742, 'epoch': 4.0})

## 方案五 数据预处理 —— 增加一级分类特殊符号

In [43]:
df5 = pd.read_csv(path / 'train.csv')
df5['section'] = df5.context.str[0]
df5['sectok'] = '[' + df5.section + ']'
special_tokens = list(df5['sectok'].unique())
special_tokens

['[A]', '[C]', '[F]', '[H]', '[B]', '[D]', '[E]', '[G]']

In [44]:
tokenizer.add_special_tokens({"additional_special_tokens" : special_tokens})
model.resize_token_embeddings(len(tokenizer))

Embedding(128009, 768)

In [45]:
df5['context_2'] = '[' + df5.context + ']'

input_columns =  ['sectok', 'context_2', 'target', 'anchor']
sep = f' [S] '
df5['inputs'] = mk_inputs(df5, sep, input_columns).str.lower()
df5.head()

Unnamed: 0,id,anchor,target,context,score,section,sectok,context_2,inputs
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,[A],[A47],[a] [s] [a47] [s] abatement of pollution [s] abatement
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,[A],[A47],[a] [s] [a47] [s] act of abating [s] abatement
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,[A],[A47],[a] [s] [a47] [s] active catalyst [s] abatement
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,[A],[A47],[a] [s] [a47] [s] eliminating process [s] abatement
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,[A],[A47],[a] [s] [a47] [s] forest region [s] abatement


In [46]:
dataset5 = Dataset.from_pandas(df5).map(lambda row: tokenizer(row['inputs']), batched=True, remove_columns= input_columns + ['id', 'section', 'inputs', 'context'])
dataset5 = dataset5.rename_column('score', 'label')
dds5 = get_dataset_dict(dataset5,trn_idxs, val_idxs)
trainer5 = Trainer(model = model, args=args, tokenizer = tokenizer, train_dataset= dds5['train'], eval_dataset=dds5['test'], compute_metrics=corr)

  0%|          | 0/37 [00:00<?, ?ba/s]

In [47]:
trainer5.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.025597,0.801202
2,No log,0.025128,0.804418
3,0.006800,0.025082,0.807355
4,0.006800,0.024958,0.808583


TrainOutput(global_step=856, training_loss=0.005916571366452725, metrics={'train_runtime': 283.229, 'train_samples_per_second': 386.359, 'train_steps_per_second': 3.022, 'total_flos': 808938977214240.0, 'train_loss': 0.005916571366452725, 'epoch': 4.0})

# 方案六 分层分组

In [48]:
from sklearn.model_selection import StratifiedGroupKFold

In [49]:
n_folds = 4
cv = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=42)

In [50]:
idxs = df5.index
scores = 100 * df5.score
folds = list(cv.split(idxs, scores, df5.anchor))
folds

[(array([    0,     1,     2, ..., 36470, 36471, 36472]),
  array([  245,   246,   247, ..., 36369, 36370, 36371])),
 (array([    0,     1,     2, ..., 36470, 36471, 36472]),
  array([   49,    50,    51, ..., 36414, 36415, 36416])),
 (array([    0,     1,     2, ..., 36414, 36415, 36416]),
  array([  266,   267,   268, ..., 36470, 36471, 36472])),
 (array([   49,    50,    51, ..., 36470, 36471, 36472]),
  array([    0,     1,     2, ..., 36057, 36058, 36059]))]

In [51]:
i = 1
for trn_idxs, val_idxs in folds:
    print(f'fold {i}')
    dds = get_dataset_dict(dataset5,trn_idxs, val_idxs)
    trainer = Trainer(model = model, args=args, tokenizer = tokenizer, train_dataset= dds['train'], eval_dataset=dds['test'], compute_metrics=corr)
    trainer.train()
    metrics = [o['eval_Pearson'] for o in trainer.state.log_history if 'eval_Pearson' in o]
    metrics[-1]
    i += 1

fold 1


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.009359,0.940145
2,No log,0.008462,0.940799
3,0.008900,0.008471,0.942625
4,0.008900,0.008222,0.943226


fold 2


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.004605,0.970309
2,No log,0.005029,0.970298
3,0.006600,0.004189,0.972367
4,0.006600,0.00411,0.97298


fold 3


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.003681,0.977405
2,No log,0.003028,0.978949
3,0.005200,0.003039,0.980313
4,0.005200,0.002943,0.980831


fold 4


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.002559,0.983906
2,No log,0.002287,0.984822
3,0.004400,0.002344,0.986358
4,0.004400,0.002033,0.986501
