In [None]:
from fastai.imports import *
import warnings, logging

# Suppress every Python warning for the rest of the session.
warnings.simplefilter('ignore')
# Globally drop log records of severity WARNING and below.
logging.disable(logging.WARNING)

In [None]:
from datasets import Dataset, DatasetDict

# Exploratory Data Analysis

In [None]:
# Data directory, given relative to the current working directory.
path = Path('us-patent-phrase-to-phrase-matching')
# List the directory contents.
# NOTE(review): `ls` is not a stdlib pathlib method — presumably provided via the fastai star import; confirm.
path.ls()

In [None]:
# Load the training CSV; the bare `df` on the last line renders the frame as the cell output.
df = pd.read_csv(path / 'train.csv')
df

In [None]:
# Summary statistics restricted to the object-dtype (string) columns.
df.describe(include='object')

In [None]:
# Total number of missing cells: per-column null counts summed over all columns.
df.isnull().sum().sum()

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

In [None]:
def value_counts(data, columns):
    """Print a frequency report for each requested column.

    For every name in ``columns`` this prints a header, the full
    ``value_counts()`` table of ``data[name]``, and one summary line with the
    largest, mean and smallest group size.

    Args:
        data: DataFrame-like object supporting ``data[name].value_counts()``.
        columns: Iterable of column names to report on.

    Returns:
        None. All output goes to stdout.
    """
    for name in columns:
        print(f'\n --- {name} --- \n')
        counts = data[name].value_counts()
        print(str(counts))
        # Group-size extremes and average, evaluated in the same order as printed.
        largest = counts.max()
        average = counts.mean()
        smallest = counts.min()
        print(f'{name}.max = {largest}, {name}.mean = {average}, {name}.min = {smallest}')
# Report group sizes for the three text columns and for the score label.
value_counts(df, ['context', 'anchor', 'target', 'score'])


In [None]:
# Largest number of space characters found in any target / anchor phrase.
df.target.str.count(' ').max(), df.anchor.str.count(' ').max()

In [None]:
# Mean and maximum number of rows sharing the same anchor.
df.anchor.value_counts().mean(), df.anchor.value_counts().max()

In [None]:
# Derive a coarser grouping key from the first character of each context code.
# Note: this adds the column to `df` in place.
df['section'] = df.context.str[0]

In [None]:
# Repeat the group-size report, this time including the derived `section` column.
value_counts(df, ['section', 'context', 'anchor', 'score'])

## 数据量与分布小结
- 原始数据 36473 rows × 5 columns，没有空值，没有异常值。
- 其中包含有 2 种分组类型：context, anchor, 和一个数值结果： score
- context 分组数据行数 max = 2186， mean = 344, min = 18     部分数据不足
- anchor 分组数据行数 max = 152, mean = 49, min = 1，        部分数据不足
- score  分组数据行数 max = 12300, mean = 7294, min = 1154， 数据充足

增强 context 分级特征
- section 分组数据行数 max = 8019, mean = 4559, min = 1279, 数据充足

## 字符型数据状况

In [None]:
def has_uppercase(data, columns):
    """Print, for each column, whether any value contains an upper-case character.

    The check is per character: a value such as ``"Hello"`` counts as having
    upper case. (Calling ``str.isupper()`` on the whole value would only be
    true when *every* cased character is upper case, which misses mixed-case
    text.)

    Args:
        data: DataFrame-like object; ``data[c]`` must be iterable over cell values.
        columns: Iterable of column names to inspect.

    Returns:
        None. One line per column is written to stdout.
    """
    for c in columns:
        # Values are passed through str() so non-string cells (e.g. NaN) do not raise.
        up = any(ch.isupper() for value in data[c] for ch in str(value))
        print(f'{c} {"has" if up else "has no"} upper case')

In [None]:
# Check the three text columns for upper-case content.
has_uppercase(df, ['context', 'anchor', 'target'])

只有 context 有 upper case  情况，整体数据都比较干净

# 设计验证数据集

In [None]:
def split_train_valid(data, column, val_prop=0.25, seed=42):
    """Split row positions into train/validation sets by whole groups.

    All rows sharing a value of ``data[column]`` land on the same side of the
    split, so no group is shared between training and validation.

    Args:
        data: DataFrame to split.
        column: Name of the grouping column. The validation mask is built
            from this column (not from a hard-coded ``anchor`` column).
        val_prop: Fraction of the *unique groups* (not rows) assigned to
            validation. Defaults to 0.25.
        seed: Seed for the shuffle. Defaults to 42.

    Returns:
        Tuple ``(trn_idxs, val_idxs)`` of NumPy integer arrays holding
        0-based row positions (not index labels).

    Note:
        Reseeds NumPy's global random generator as a side effect.
    """
    groups = list(data[column].unique())
    np.random.seed(seed)
    np.random.shuffle(groups)

    val_size = int(len(groups) * val_prop)
    val_groups = groups[:val_size]
    # Mask on the requested grouping column.
    is_val = data[column].isin(val_groups).to_numpy()

    idxs = np.arange(len(is_val))
    trn_idxs = idxs[~is_val]
    val_idxs = idxs[is_val]
    return trn_idxs, val_idxs

In [None]:
# Split row positions so that each anchor falls entirely in train or in validation.
trn_idxs, val_idxs = split_train_valid(df, 'anchor')
# Show both sizes followed by the raw position arrays.
len(trn_idxs), len(val_idxs), trn_idxs, val_idxs

In [None]:
from datasets import DatasetDict
def get_dataset_dict(dataset, trn_idxs, val_idxs):
    """Build a two-split ``DatasetDict`` from row-position arrays.

    Args:
        dataset: Object exposing ``select(indices)``.
        trn_idxs: Row positions for the ``"train"`` split.
        val_idxs: Row positions for the ``"test"`` split (used as the
            validation set by the callers in this notebook).

    Returns:
        ``DatasetDict`` with keys ``"train"`` and ``"test"``.
    """
    splits = {}
    splits["train"] = dataset.select(trn_idxs)
    splits["test"] = dataset.select(val_idxs)
    return DatasetDict(splits)

## 方案一 数据预处理

In [None]:
from transformers import AutoTokenizer
# Checkpoint name; used for the tokenizer here and for the model further down.
model_nm = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(model_nm)
# Display the tokenizer's special tokens.
tokenizer.all_special_tokens

In [None]:
def tokenize(dataset, projection):
    """Apply the notebook-level ``tokenizer`` to one field of every row.

    Args:
        dataset: Object exposing ``map(fn)``.
        projection: Key of the field in each row that is passed to the tokenizer.

    Returns:
        Whatever ``dataset.map`` returns.
    """
    def _encode(row):
        # `tokenizer` is looked up in the enclosing module at call time.
        return tokenizer(row[projection])

    return dataset.map(_encode)

In [None]:
def mk_inputs(data_frame, sep, columns):
    """Concatenate several string columns row by row.

    Args:
        data_frame: Source DataFrame.
        sep: Separator string placed between consecutive fields.
        columns: List of column names, joined in the given order.

    Returns:
        Series (same index as ``data_frame``) of the joined strings.
    """
    def _join_row(row):
        return sep.join(row)

    selected = data_frame[columns]
    return selected.apply(_join_row, axis=1)

In [None]:
# Fields concatenated into the model input, in this order: context, target, anchor.
input_columns =  ['context', 'target', 'anchor']
# Join fields with the tokenizer's own separator token, padded with a space on each side.
sep = f' {tokenizer.sep_token} '
df['inputs'] = mk_inputs(df, sep, input_columns)
df.head()

In [None]:
from datasets import Dataset
# Tokenize the joined text in batches, then drop the raw text, id and helper columns listed in `remove_columns`.
dataset = Dataset.from_pandas(df).map(lambda row: tokenizer(row['inputs']), batched=True, remove_columns= input_columns + ['id', 'section', 'inputs'])
# Rename the target column to `label`.
# presumably because Trainer reads the target from a `label`/`labels` field — TODO confirm
dataset = dataset.rename_column('score', 'label')
# Inspect the first encoded example.
dataset[0]

In [None]:
# Apply the anchor-grouped split to the tokenized dataset.
dds = get_dataset_dict(dataset,trn_idxs, val_idxs)
dds

## 准备 transformer arguments, model and trainer

In [None]:
def corr(eval_preds):
    """Compute the Pearson correlation metric for a Trainer evaluation.

    Args:
        eval_preds: Iterable that is unpacked positionally into
            ``np.corrcoef`` (predictions first, then labels).

    Returns:
        Dict with the single key ``"Pearson"`` mapped to the off-diagonal
        entry ``[0][1]`` of the correlation matrix.
    """
    coef_matrix = np.corrcoef(*eval_preds)
    return {"Pearson": coef_matrix[0][1]}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a single-output sequence-classification head.
# NOTE(review): Transformers treats num_labels == 1 as a regression task — confirm for the installed version.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 1)

In [None]:
# Training hyper-parameters: per-device batch size, learning rate and epoch count.
bs = 128
lr = 8e-5
epochs = 4
# Evaluate once per epoch; cosine schedule with 10% warm-up; mixed precision (fp16);
# evaluation uses twice the training batch size; external experiment reporting is off.
# NOTE(review): `lr` is defined above but is not passed to TrainingArguments in this
# call, so the library's default learning rate applies — confirm whether
# `learning_rate=lr` was intended.
# NOTE(review): newer Transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(output_dir='outputs', evaluation_strategy='epoch', 
                         weight_decay=0.01, warmup_ratio=0.1,
                         per_device_train_batch_size= bs, per_device_eval_batch_size= bs * 2,
                         num_train_epochs=epochs, lr_scheduler_type='cosine', fp16=True,
                         report_to='none')

In [None]:
# Wire the model, both data splits and the Pearson metric into a Trainer.
trainer = Trainer(model = model, args=args, tokenizer = tokenizer, train_dataset= dds['train'], eval_dataset=dds['test'], compute_metrics=corr)

In [None]:
# Run fine-tuning for scheme 1.
trainer.train()

## 方案二 数据预处理 —— context => [context]

In [None]:
# Reload the raw training data so this variant starts from an unmodified frame.
df2 = pd.read_csv(path / 'train.csv')
df2['section'] = df2.context.str[0]
# Wrap each context code in square brackets.
df2['context_2'] = '[' + df2.context + ']'

In [None]:
# Same field order as scheme 1, but using the bracketed context column.
# Note: this rebinds the notebook-level `input_columns` and `sep`.
input_columns =  ['context_2', 'target', 'anchor']
sep = f' {tokenizer.sep_token} '
df2['inputs'] = mk_inputs(df2, sep, input_columns)
df2.head()

In [None]:
# Tokenize the joined text in batches and drop the columns listed in `remove_columns`.
dataset2 = Dataset.from_pandas(df2).map(lambda row: tokenizer(row['inputs']), batched=True, remove_columns= input_columns + ['id', 'section', 'inputs', 'context'])
# Rename the target column to `label`.
dataset2 = dataset2.rename_column('score', 'label')
dataset2

In [None]:
# Reuse the anchor-grouped row positions computed earlier so every scheme is scored on the same validation rows.
dds2 = get_dataset_dict(dataset2,trn_idxs, val_idxs)
dds2

In [None]:
# Start from fresh pretrained weights. Passing the notebook-level `model` here
# would continue training a network that an earlier scheme already fine-tuned,
# so the reported score would not reflect this preprocessing variant alone.
model2 = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer2 = Trainer(model=model2, args=args, tokenizer=tokenizer, train_dataset=dds2['train'], eval_dataset=dds2['test'], compute_metrics=corr)

In [None]:
# Run fine-tuning for scheme 2.
trainer2.train()

## 方案三 数据预处理 —— 自定义分隔符 [SEP] => [S]

In [None]:
# Reload the raw data and rebuild the bracketed context column.
df3 = pd.read_csv(path / 'train.csv')
df3['section'] = df3.context.str[0]
df3['context_2'] = '[' + df3.context + ']'

input_columns =  ['context_2', 'target', 'anchor']
# Use the literal text ' [S] ' as the field separator instead of the tokenizer's sep token.
# NOTE(review): nothing in the visible code registers '[S]' with the tokenizer, so it
# is presumably tokenized as ordinary text — confirm this is intended.
sep = f' [S] '
df3['inputs'] = mk_inputs(df3, sep, input_columns)
df3.head()

In [None]:
# Tokenize the joined text in batches and drop the columns listed in `remove_columns`.
dataset3 = Dataset.from_pandas(df3).map(lambda row: tokenizer(row['inputs']), batched=True, remove_columns= input_columns + ['id', 'section', 'inputs', 'context'])
# Rename the target column to `label`.
dataset3 = dataset3.rename_column('score', 'label')
dataset3

In [None]:
# Same train/validation row positions as the earlier schemes.
dds3 = get_dataset_dict(dataset3,trn_idxs, val_idxs)
dds3

In [None]:
# Start from fresh pretrained weights. Passing the notebook-level `model` here
# would continue training a network that an earlier scheme already fine-tuned,
# so the reported score would not reflect this preprocessing variant alone.
model3 = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer3 = Trainer(model=model3, args=args, tokenizer=tokenizer, train_dataset=dds3['train'], eval_dataset=dds3['test'], compute_metrics=corr)

In [None]:
# Run fine-tuning for scheme 3.
trainer3.train()

## 方案四 数据预处理 —— inputs 小写

In [None]:
# Reload the raw data and rebuild the bracketed context column.
df4 = pd.read_csv(path / 'train.csv')
df4['section'] = df4.context.str[0]
df4['context_2'] = '[' + df4.context + ']'

input_columns =  ['context_2', 'target', 'anchor']
sep = f' [S] '
# Lower-case the whole joined string; this also lower-cases the ' [S] ' separator
# and the bracketed context code, not just the phrases.
df4['inputs'] = mk_inputs(df4, sep, input_columns).str.lower()
df4.head()

In [None]:
# Tokenize the lower-cased inputs in batches and drop the raw text / id / helper columns.
dataset4 = Dataset.from_pandas(df4).map(lambda row: tokenizer(row['inputs']), batched=True, remove_columns=input_columns + ['id', 'section', 'inputs', 'context'])
# Rename the target column to `label`.
dataset4 = dataset4.rename_column('score', 'label')
# Same train/validation row positions as the earlier schemes.
dds4 = get_dataset_dict(dataset4, trn_idxs, val_idxs)
# Start from fresh pretrained weights. Passing the notebook-level `model` here
# would continue training a network that an earlier scheme already fine-tuned,
# so the reported score would not reflect this preprocessing variant alone.
model4 = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer4 = Trainer(model=model4, args=args, tokenizer=tokenizer, train_dataset=dds4['train'], eval_dataset=dds4['test'], compute_metrics=corr)

In [None]:
# Run fine-tuning for scheme 4.
trainer4.train()

## 方案五 数据预处理 —— 增加一级分类特殊符号

In [None]:
# Reload the raw data and derive the first-character section key.
df5 = pd.read_csv(path / 'train.csv')
df5['section'] = df5.context.str[0]
# Bracketed section marker built from the raw section character.
df5['sectok'] = '[' + df5.section + ']'
# Distinct markers; these are registered with the tokenizer in the next cell.
special_tokens = list(df5['sectok'].unique())
special_tokens

In [None]:
# Register the section markers as additional special tokens.
# Note: this mutates the shared `tokenizer` in place.
tokenizer.add_special_tokens({"additional_special_tokens" : special_tokens})
# Resize the model's token embeddings to the tokenizer's new length.
model.resize_token_embeddings(len(tokenizer))

In [None]:
df5['context_2'] = '[' + df5.context + ']'

input_columns = ['sectok', 'context_2', 'target', 'anchor']
sep = ' [S] '
# Lower-case only the fields that follow the section marker. The markers were
# registered with the tokenizer in their original case (built from the raw
# section character), so lower-casing the whole string would turn each marker
# into text that no longer matches any registered special token.
lowered_text = mk_inputs(df5, sep, input_columns[1:]).str.lower()
# The separator after the marker is lower-cased to match the separators inside
# `lowered_text`.
df5['inputs'] = df5['sectok'] + sep.lower() + lowered_text
df5.head()

In [None]:
# Tokenize the inputs in batches and drop the raw text / id / helper columns.
dataset5 = Dataset.from_pandas(df5).map(lambda row: tokenizer(row['inputs']), batched=True, remove_columns=input_columns + ['id', 'section', 'inputs', 'context'])
# Rename the target column to `label`.
dataset5 = dataset5.rename_column('score', 'label')
# Same train/validation row positions as the earlier schemes.
dds5 = get_dataset_dict(dataset5, trn_idxs, val_idxs)
# Start from fresh pretrained weights so this scheme is not trained on top of a
# network an earlier scheme already fine-tuned. The embedding matrix must be
# resized because the tokenizer has been extended with extra special tokens.
model5 = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
model5.resize_token_embeddings(len(tokenizer))
trainer5 = Trainer(model=model5, args=args, tokenizer=tokenizer, train_dataset=dds5['train'], eval_dataset=dds5['test'], compute_metrics=corr)

In [None]:
# Run fine-tuning for scheme 5.
trainer5.train()

# 方案六 分层分组

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

In [None]:
# Number of cross-validation folds.
n_folds = 4
# Seeded, shuffled splitter.
# NOTE(review): presumably stratifies on the target while keeping each group inside a single fold — confirm in the scikit-learn docs.
cv = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=42)

In [None]:
idxs = df5.index
# Scale the scores by 100 before using them as the stratification target.
# presumably so the fractional scores become whole-number class values — TODO confirm the set of score values
scores = 100 * df5.score
# Split with `anchor` as the group key; materialize the generator into a list so the folds can be reused.
folds = list(cv.split(idxs, scores, df5.anchor))
folds

In [None]:
# Cross-validate: train one model per fold and collect the final-epoch Pearson score.
fold_scores = []
# Fold-specific names are used so the loop does not overwrite the notebook-level
# `trn_idxs` / `val_idxs` / `dds` / `trainer` that earlier cells rely on.
for fold_no, (fold_trn_idxs, fold_val_idxs) in enumerate(folds, start=1):
    print(f'fold {fold_no}')
    fold_dds = get_dataset_dict(dataset5, fold_trn_idxs, fold_val_idxs)
    # Every fold must start from fresh pretrained weights: a model carried over
    # from the previous fold has already been trained on rows that are in this
    # fold's validation split, which would inflate the score.
    fold_model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
    # The tokenizer was extended with extra special tokens, so match the embedding size.
    fold_model.resize_token_embeddings(len(tokenizer))
    fold_trainer = Trainer(model=fold_model, args=args, tokenizer=tokenizer, train_dataset=fold_dds['train'], eval_dataset=fold_dds['test'], compute_metrics=corr)
    fold_trainer.train()
    # Keep the metric from the last evaluation logged for this fold.
    metrics = [o['eval_Pearson'] for o in fold_trainer.state.log_history if 'eval_Pearson' in o]
    fold_scores.append(metrics[-1])
    print(f'fold {fold_no} final eval_Pearson = {metrics[-1]}')

# Per-fold scores and their mean as the cell output.
fold_scores, np.mean(fold_scores)