In [None]:
import numpy as np 
import pandas as pd 
import os

In [None]:
# Competition files: the training set (has a `target` column), the test set,
# and the submission template that is filled in at the end of the notebook.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Basic features: excerpt length and sentence-length statistics

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from statistics import mean

In [None]:
def min_max_mean_sentence_length(text):
    '''
    Summarise sentence lengths, measured in space-separated words.

    The text is cut into sentences with NLTK's ``sent_tokenize`` and every
    sentence is split on single spaces, so consecutive spaces yield empty
    "words" that are still counted.

    Args:
        text: The passage to analyse.

    Returns:
        A ``(max, min, mean)`` tuple of the per-sentence word counts, the
        mean rounded to 3 decimals. ``(0, 0, 0)`` when the text contains no
        sentence at all.
    '''
    # One entry per sentence occurrence. A list (rather than a dict keyed by
    # the sentence text) keeps repeated sentences, which would otherwise
    # collapse into a single entry and distort the mean.
    lengths = [len(sentence.split(" ")) for sentence in sent_tokenize(text)]
    if not lengths:
        # max()/min()/mean() all raise on an empty sequence.
        return 0, 0, 0
    return max(lengths), min(lengths), round(mean(lengths), 3)

In [None]:
def basic_features(_):
    '''
    Return a copy of the frame with basic length features added.

    Added columns:
        excerpt_len: number of characters in ``excerpt`` (spaces included).
        excerpt_word_count: number of pieces after splitting ``excerpt`` on
            single spaces.
        max_len_sent, min_len_sent, avg_len_sent: the three values returned
            by ``min_max_mean_sentence_length`` for the excerpt.

    Args:
        _: DataFrame with an ``excerpt`` text column; it is not modified.
    '''
    sentence_columns = ['max_len_sent', 'min_len_sent', 'avg_len_sent']

    def count_words(passage):
        return len(passage.split(' '))

    def sentence_stats(row):
        return min_max_mean_sentence_length(row['excerpt'])

    df = _.copy()
    excerpts = df['excerpt']
    df['excerpt_len'] = excerpts.apply(len)
    df['excerpt_word_count'] = excerpts.apply(count_words)
    # result_type='expand' spreads the returned 3-tuple over three columns.
    df[sentence_columns] = df.apply(sentence_stats, axis=1, result_type='expand')
    return df

In [None]:
# Add the basic length features to both the training and the test frame.
train, test = basic_features(train), basic_features(test)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the training rows (fixed seed) to score the model.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='target'),
    train['target'].values,
    test_size=0.20,
    random_state=42,
)
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

# Experiment 1: length-based features only.
features = [
    'excerpt_len',
    'excerpt_word_count',
    'min_len_sent',
    'max_len_sent',
    'avg_len_sent',
]
import lightgbm as lgb
gbm = lgb.LGBMRegressor(random_state=42)
gbm.fit(X_train[features], y_train, eval_metric='mse')
pred_y = gbm.predict(X_test[features])
print(
    f' Test RMSE using basic features '
    f'{round(np.sqrt(mean_squared_error(y_test, pred_y)), 4)}'
)

# Readability indices (Flesch Reading Ease, Flesch-Kincaid, Gunning Fog, SMOG)

In [None]:
#https://www.kaggle.com/duboisian/first-draft-model?scriptVersionId=63553418&cellId=2
def GrunningFog(excerpt):
    """
    Estimate a grade level for ``excerpt`` with the Gunning Fog formula.

    Newlines are turned into spaces, the text is cut at every '.', and
    fragments of at most one character are dropped. Each remaining sentence
    is tokenised with ``nltk.word_tokenize`` and lemmatised; a token is
    "complex" when ``syllable_count`` reports three or more syllables.

    Returns:
        ``0.4 * (average tokens per sentence + 100 * complex / total tokens)``.
    """
    flattened = excerpt.replace('\n', ' ')
    sentences = [part for part in flattened.split('.') if len(part) > 1]
    lemmatizer = nltk.stem.WordNetLemmatizer()

    tokens_per_sentence = []
    complex_per_sentence = []
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        tokens_per_sentence.append(len(tokens))
        # Syllables are counted on the lemma, not on the surface form.
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]
        flags = [int(syllable_count(lemma) >= 3) for lemma in lemmas]
        complex_per_sentence.append(np.sum(flags))

    # Average sentence length, in tokens.
    average_sentence_length = np.mean(tokens_per_sentence)
    # Share of complex tokens over the whole excerpt.
    complex_share = np.sum(complex_per_sentence) / np.sum(tokens_per_sentence)
    return 0.4 * (average_sentence_length + (100 * complex_share))

In [None]:
def SMOG(excerpt):
    '''
    Estimate a grade level for ``excerpt`` with the SMOG formula.

    Newlines are turned into spaces, the text is cut into sentences at every
    '.', and fragments of at most one character are dropped. Each sentence is
    tokenised with ``nltk.word_tokenize``; a token is "complex" when
    ``syllable_count`` reports three or more syllables.

    Args:
        excerpt: The passage to score.

    Returns:
        ``1.0430 * sqrt(complex_tokens * 30 / sentences) + 3.1291``, or NaN
        when the excerpt contains no usable sentence.
    '''
    sentences = [part for part in excerpt.replace('\n', ' ').split('.') if len(part) > 1]
    if not sentences:
        # 30 / len(sentences) would raise ZeroDivisionError; NaN marks
        # "no score" instead of aborting the whole feature pipeline.
        return np.nan

    complex_total = 0
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        complex_total += sum(1 for token in tokens if syllable_count(token) >= 3)

    return (1.0430 * np.sqrt(complex_total * (30 / len(sentences)))) + 3.1291

In [None]:
#https://stackoverflow.com/a/46759549
def syllable_count(word):
    '''
    Estimate the number of syllables in ``word``.

    Heuristic: every run of consecutive vowels (a, e, i, o, u, y) counts as
    one syllable, one is subtracted when the word ends in "e", and any
    non-empty word is reported as having at least one syllable.

    Args:
        word: The word to analyse; case is ignored.

    Returns:
        The estimated syllable count, or 0 for an empty string.
    '''
    if not word:
        # Nothing to count; indexing the first character would raise IndexError.
        return 0

    word = word.lower()
    vowels = "aeiouy"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run opens a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel
    if word.endswith("e"):
        count -= 1
    return max(count, 1)

In [None]:
def asw_asl(_):
    '''
    Return a copy of the frame with readability-formula features added.

    Added columns:
        ASL: average number of space-separated pieces per '.'-delimited
            segment of the excerpt.
        ASW: sum of ``syllable_count`` over the space-separated pieces of the
            excerpt (periods removed) divided by the number of pieces.
        RE: ``206.835 - 1.015 * ASL - 84.6 * ASW``.
        FKRA: ``0.39 * ASL + 11.8 * ASW - 15.59``.
        GrunFog: ``GrunningFog(excerpt)``.
        SMOG: ``SMOG(excerpt)``.

    Args:
        _: DataFrame with an ``excerpt`` text column; it is not modified.
    '''
    # NOTE(review): both helpers delete newlines outright, which fuses the
    # words on either side of a line break, whereas GrunningFog/SMOG replace
    # them with a space. Kept as is so feature values stay unchanged - confirm
    # whether this is intended.
    def average_sentence_length(text):
        segments = text.replace('\n', '').split('.')
        return np.sum([len(segment.split(' ')) for segment in segments]) / len(segments)

    def average_syllables_per_word(text):
        pieces = text.replace('\n', '').replace('.', '').split(' ')
        # Empty pieces (from repeated spaces) contribute no syllables but are
        # still part of the denominator.
        syllables = [syllable_count(piece) if len(piece) > 0 else 0 for piece in pieces]
        return np.sum(syllables) / len(pieces)

    df = _.copy()
    excerpts = df['excerpt']
    df['ASL'] = excerpts.apply(average_sentence_length)
    df['ASW'] = excerpts.apply(average_syllables_per_word)

    # Plain column arithmetic: same numbers as a row-wise apply, but faster
    # and well defined for an empty frame.
    df['RE'] = 206.835 - (1.015 * df['ASL']) - (84.6 * df['ASW'])
    df['FKRA'] = (0.39 * df['ASL']) + (11.8 * df['ASW']) - 15.59
    df['GrunFog'] = excerpts.apply(GrunningFog)
    df['SMOG'] = excerpts.apply(SMOG)
    return df

In [None]:
# Add the readability-formula features to both the training and the test frame.
train, test = asw_asl(train), asw_asl(test)

In [None]:
# Experiment 2: basic length features plus the readability indices.
# Same seed and test size as the first experiment.
X_train, X_test, y_train, y_test = train_test_split(train.drop(columns='target'), train['target'].values, random_state=42,test_size=0.20)
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))
features = ['excerpt_len', 'excerpt_word_count', 'min_len_sent', 'max_len_sent', 'avg_len_sent','ASL',
 'ASW','RE','FKRA','GrunFog','SMOG']


gbm = lgb.LGBMRegressor(random_state=42)
gbm.fit(X_train[features],y_train,eval_metric='mse')
pred_y = gbm.predict(X_test[features])
# The label names the feature set actually used, so the printed scores of the
# different experiments can be told apart.
print(f' Test RMSE using basic + readability features {round(np.sqrt(mean_squared_error(y_test,pred_y)),4)}')

# Adding the clbert (transformer) prediction as a feature

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AutoConfig

from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device: ', device.type)

In [None]:

# Local paths of the pretrained checkpoints.
DISTILBERT = '../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased'
ROBERTA = '../input/huggingface-roberta-variants/roberta-base/roberta-base'
BERT = '../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased'

# Architecture used for both the tokenizer and the model.
ARCH_PATH = DISTILBERT
# When False, the fold checkpoints are read from an input dataset (see MODEL_NAME).
if_train = False

cfg = {
    'train': {
        'n_folds': 5,
        'n_epochs': 100,
    },
    # DataLoader keyword arguments used for inference.
    'dl_val': {
        'batch_size': 64 if device.type != 'cpu' else 8,
        'shuffle': False,
        'num_workers': os.cpu_count(),
        'pin_memory': True,
    },
    'model': {'name': ARCH_PATH},
    'tokenizer': {'name': ARCH_PATH, 'max_length': 210},
}
class cldataset(Dataset):
    '''
    Dataset that tokenises one excerpt per item for the transformer model.

    ``ds[i]`` returns a pair of dicts:
        * model inputs: ``ids`` (token ids), ``mask`` (attention mask) and
          ``token_type_ids`` (segment ids when the tokenizer produces them,
          otherwise a scalar placeholder so batches keep a uniform structure);
        * ``{'target': float tensor of shape (1,)}`` used as the label.

    Args:
        df: DataFrame with ``excerpt`` and ``target`` columns.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length every excerpt is padded / truncated to.
    '''
    def __init__(self,df,tokenizer,max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self,index):
        # Samplers hand out positions 0..len-1, so rows are read by position;
        # label-based lookup fails on frames whose index is not 0..n-1
        # (e.g. a filtered or fold-split frame).
        text = self.df['excerpt'].iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True
        )

        # Decide from the tokenizer output itself. Comparing the configured
        # name with 'bert-base-uncased' never matched because the configured
        # names are filesystem paths, so BERT never received its segment ids.
        if 'token_type_ids' in inputs:
            token_type_ids = torch.LongTensor(inputs['token_type_ids'])
        else:
            token_type_ids = torch.tensor(1.)

        target = float(self.df['target'].iloc[index])
        return {
            'ids': torch.LongTensor(inputs['input_ids']),        # vocabulary ids of the tokens
            'mask': torch.LongTensor(inputs['attention_mask']),  # attention mask from the tokenizer
            'token_type_ids': token_type_ids                     # segment ids or placeholder
            },{
            'target': torch.tensor([target], dtype=torch.float32)
    }
class clbert(nn.Module):
    '''
    Pretrained transformer encoder followed by a one-output linear layer.

    Args:
        name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
            ``BERT`` and ``ROBERTA`` use the encoder's pooled output; every
            other name (including ``DISTILBERT``) uses the first token of the
            last hidden state passed through ``dense`` + tanh.
        dropout: When False the dropout before the output layer is disabled
            (probability 0); the default keeps it at 0.2.
    '''
    def __init__(self,name,dropout=True):
        super(clbert, self).__init__()
        self.bert = AutoModel.from_pretrained(name)  # load the pretrained encoder
        self.name = name

        if name in (BERT, ROBERTA):
            self.in_features = self.bert.pooler.dense.out_features
        elif name == DISTILBERT:
            # Width of the last transformer layer, whatever the layer count.
            self.in_features = self.bert.transformer.layer[-1].output_layer_norm.normalized_shape[0]
        else:
            # Unknown architecture: take the width from its config, falling
            # back to 768 when the config does not expose hidden_size.
            self.in_features = getattr(self.bert.config, 'hidden_size', 768)

        # `fc` and `dense` are created for every architecture (and in this
        # order) so parameter names and initialisation stay the same.
        self.fc = nn.Linear(self.in_features, 1)
        self.dense = nn.Linear(self.in_features, self.in_features)
        self.activation = nn.Tanh()
        self.dropout = nn.Dropout(p=0.2 if dropout else 0.0)

        torch.nn.init.kaiming_normal_(self.dense.weight)
        torch.nn.init.kaiming_normal_(self.fc.weight)

    def forward(self, ids, mask, token_type_ids):
        '''
        Return the model output for a batch.

        ``token_type_ids`` is only forwarded to the encoder for ``BERT``; the
        other architectures ignore it.
        '''
        if self.name == BERT:
            _, output = self.bert(ids,
                                  attention_mask=mask,
                                  token_type_ids=token_type_ids,
                                  return_dict=False)
        elif self.name == ROBERTA:
            _, output = self.bert(ids,
                                  attention_mask=mask,
                                  return_dict=False)
        else:
            # DistilBERT has no pooler, so one is built here from the first
            # token. The same path serves any other architecture, which
            # previously left `output` undefined and crashed.
            last_hidden_state = self.bert(ids,
                                          attention_mask=mask,
                                          return_dict=False)[0]
            first_token_tensor = last_hidden_state[:, 0]
            output = self.activation(self.dense(first_token_tensor))

        output = self.dropout(output)
        return self.fc(output)
def val_fn_cv(model, dl):
    '''
    Run ``model`` over every batch of ``dl`` and collect its outputs.

    The model is switched to eval mode and moved to the module-level
    ``device``; gradients are disabled and the forward pass runs under
    ``autocast``.

    Args:
        model: Module called as ``model(**inputs)``.
        dl: Iterable of batches whose first element is a dict of input tensors.

    Returns:
        The per-batch outputs as one numpy array, concatenated along the
        first axis.
    '''
    model.eval()
    model.to(device)

    preds = []
    with torch.no_grad():
        for data in tqdm(dl, desc='cv'):
            # Only the inputs are needed for prediction; the target part of
            # the batch is left untouched.
            inputs = {key: value.to(device) for key, value in data[0].items()}
            with autocast():
                outputs = model(**inputs)
            preds.append(outputs.detach().cpu().numpy())

    return np.concatenate(preds)

def main_infer(): 
    '''
    Predict the test set with every fold's checkpoint and average the results.

    Reads the CSV at ``TEST``, then for each of ``cfg['train']['n_folds']``
    folds loads the weights stored under the ``'model'`` key of
    ``MODEL_NAME + '_fold{fold}.tar'`` and predicts the whole file.

    Returns:
        The test DataFrame with a ``target`` column holding the mean of the
        per-fold predictions.
    '''
    df = pd.read_csv(TEST)
    df['target'] = 0.

    tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

    # The data is identical for every fold, so the dataset and loader are
    # built once instead of once per fold.
    test_ds = cldataset(df, tokenizer=tokenizer, max_len=cfg['tokenizer']['max_length'])
    test_dl = DataLoader(test_ds, **cfg['dl_val'])

    n_folds = cfg['train']['n_folds']
    for fold in range(n_folds):
        print('Fold:', fold)
        # Load the trained weights of this fold.
        model = clbert(name=cfg['model']['name'])
        PATH = MODEL_NAME + f'_fold{fold}.tar'
        state_dict = torch.load(PATH, map_location=device)['model']
        model.load_state_dict(state_dict)

        preds = val_fn_cv(model=model, dl=test_dl)
        # Flatten the model output to one value per row before accumulating.
        df['target'] = df['target'] + preds.reshape(-1)

    df['target'] = df['target'] / n_folds
    return df
# Checkpoint prefix: the input-dataset copy when not training in this
# notebook, otherwise the local 'clbert' prefix.
MODEL_NAME = '../input/clberttrainingoutputs/clbert' if if_train==False else 'clbert'
TEST = '../input/commonlitreadabilityprize/test.csv'
# Fold-averaged transformer predictions for the test set.
df = main_infer()

In [None]:
# Show the ids together with the fold-averaged predictions returned by main_infer().
df[['id','target']]

In [None]:
# Out-of-fold clbert predictions become the `clbert` feature of the training
# rows; the test rows get the predictions computed by main_infer() above.
# (Alternative for the test rows: read the precomputed `target` column from
# ../input/clberttrainingoutputs/submission.csv instead of `df`.)
f1 = pd.read_csv(r'../input/clberttrainingoutputs/oof_df.csv')
train_b = pd.merge(train, f1[['id', 'oof']].rename(columns={'oof': 'clbert'}))
test_b = pd.merge(test, df[['id', 'target']].rename(columns={'target': 'clbert'}))

In [None]:
# Display the training frame with the engineered features (the `clbert` column lives in train_b, not here).
train

In [None]:
# Experiment 3: all hand-crafted features plus the clbert prediction.
X_train, X_test, y_train, y_test = train_test_split(train_b.drop(columns='target'), train_b['target'].values, random_state=42,test_size=0.2)
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

features = ['excerpt_len', 'excerpt_word_count', 'min_len_sent', 'max_len_sent', 'avg_len_sent',  # basic length features
            'ASL','ASW','RE','FKRA','GrunFog','SMOG',  # readability / grade-level indices
            'clbert'  # transformer prediction
           ]

gbm = lgb.LGBMRegressor(random_state=42)
gbm.fit(X_train[features],y_train,eval_metric='mse')
pred_y = gbm.predict(X_test[features])
# The label names the feature set actually used, so the printed scores of the
# different experiments can be told apart.
print(f' Test RMSE using basic + readability + clbert features {round(np.sqrt(mean_squared_error(y_test,pred_y)),4)}')
# Adding the clbert feature lowered the RMSE from 0.823 to 0.509.

In [None]:
# Score the test rows with the last fitted model and write the submission file.
sub_y = gbm.predict(test_b[features])
sample['target'] = sub_y
sample.to_csv('./submission.csv', columns=['id', 'target'], index=False)