In [None]:
# Mount Google Drive under /content/drive; a later cell reads the training table from it.
from google.colab import drive
import os
drive.mount('/content/drive')

Mounted at /content/drive


### Install

In [None]:
# Install the third-party packages this notebook depends on.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve newer releases
# than the ones shown in the recorded output below.
!pip install --upgrade transformers
!pip install sentencepiece
!pip install textstat
!pip install pyspellchecker
!pip install datasets
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
Colle

### Kaggle Dataset

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c commonlit-evaluate-student-summaries

Downloading commonlit-evaluate-student-summaries.zip to /content
 95% 1.00M/1.05M [00:00<00:00, 1.99MB/s]
100% 1.05M/1.05M [00:00<00:00, 2.08MB/s]


In [None]:
import zipfile

# Unpack the downloaded competition archive into /content/.
with zipfile.ZipFile("/content/commonlit-evaluate-student-summaries.zip", mode='r') as zip_ref:
    zip_ref.extractall(path="/content/")

### Imports

In [None]:
from typing import List
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import logging
import os
import gc
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,EarlyStoppingCallback
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm
import textstat
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
from nltk import pos_tag,ne_chunk, word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from collections import Counter
import spacy
import re
from spellchecker import SpellChecker
import lightgbm as lgb

import random
# logging setting

# Ignore every Python warning and disable all log records at ERROR severity and below.
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
# Environment variables set before any model/tokenizer work starts
# (presumably consumed by the HF tokenizers library and TensorFlow's native logger — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Turn off `datasets` progress bars and enable the pandas `progress_apply` methods used below.
disable_progress_bar()
tqdm.pandas()

### Device

In [None]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Set seed to 42

In [None]:
# set random seed
def seed_everything(seed: int) -> None:
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for repeatable results.

    Args:
        seed: Value used for all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU rather than only the current one; this is ignored when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmark enabled cuDNN auto-tunes and may select a different
    # algorithm on each run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

### Configuration

In [None]:
EXP_NUM = 3  # experiment index; becomes part of the checkpoint directory name

class CFG:
    """Hyper-parameters and paths shared by the training cells."""

    # Backbone: loaded from f"{dir_model}{model_name}".
    model_name = "deberta-v3-base"
    dir_model = "microsoft/"
    # Root directory under which the per-fold checkpoints are written.
    save_model_path = f'/content/{model_name}-model/exp_{EXP_NUM}'

    # Optimisation.
    learning_rate = 1.6e-5
    weight_decay = 0.03
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0
    num_train_epochs = 5
    batch_size = 12

    # Cross-validation and reproducibility.
    n_splits = 4
    random_seed = 42

    # Evaluation/checkpoint cadence (in optimisation steps) and early stopping.
    save_steps = 100
    early_stopping_patience = 25

    # Tokenisation and data options.
    max_length = 1024
    augmentations = False

### Read Data

In [None]:
DATA_DIR = "/content/"

# Load the competition CSV files that were extracted into DATA_DIR.
prompts_train = pd.read_csv(f"{DATA_DIR}prompts_train.csv")
prompts_test = pd.read_csv(f"{DATA_DIR}prompts_test.csv")
summaries_train = pd.read_csv(f"{DATA_DIR}summaries_train.csv")
summaries_test = pd.read_csv(f"{DATA_DIR}summaries_test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

### Pre-processor Class

In [None]:
class Preprocessor:
    """Derive hand-crafted text features for every (prompt, summary) pair.

    ``run`` is the entry point: it merges summaries with their prompts and
    appends length, overlap, punctuation, sentiment, POS and readability
    columns. The remaining methods compute the individual features.
    """

    def __init__(self,model_name: str,dir_model: str) -> None:
        """Load the HF tokenizer, NLTK stop words, spaCy model and spell checker.

        Args:
            model_name: Model identifier appended to ``dir_model``.
            dir_model: Prefix (hub organisation or local directory) of the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(f"{dir_model}{model_name}")
        self.STOP_WORDS = set(stopwords.words('english'))

        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = SpellChecker()
        # Created on first use and then shared by every row (see
        # calculate_sentiment_scores).
        self._sentiment_analyzer = None

    def count_text_length(self, df: pd.DataFrame, col:str) -> pd.Series:
        """Return, per row, the number of ids the HF tokenizer yields for ``df[col]``."""
        tokenizer=self.tokenizer
        return df[col].progress_apply(lambda x: len(tokenizer.encode(x)))

    def _spacy_words(self, text):
        """Tokenize ``text`` with the spaCy tokenizer and return the tokens as strings."""
        return [str(token) for token in self.spacy_ner_model.tokenizer(text)]

    def _shared_word_count(self, row, keep_stop_words: bool) -> int:
        """Count distinct words shared by ``row['prompt_text']`` and ``row['text']``.

        When the stop-word set is non-empty only stop words
        (``keep_stop_words=True``) or only non-stop words
        (``keep_stop_words=False``) are compared; with an empty set nothing is
        filtered.
        """
        prompt_words = self._spacy_words(row['prompt_text'])
        summary_words = self._spacy_words(row['text'])

        if self.STOP_WORDS:
            prompt_words = [w for w in prompt_words if (w in self.STOP_WORDS) == keep_stop_words]
            summary_words = [w for w in summary_words if (w in self.STOP_WORDS) == keep_stop_words]
        return len(set(prompt_words) & set(summary_words))

    #JUST STOP WORDS OVERLAP
    def word_overlap_count(self, row):
        """Number of distinct stop words present in both the prompt and the summary."""
        return self._shared_word_count(row, keep_stop_words=True)

    #REAL OVERLAP WORD
    def word_overlap_count_real(self, row):
        """Number of distinct non-stop words present in both the prompt and the summary."""
        return self._shared_word_count(row, keep_stop_words=False)


    def ngrams(self, token, n):
        """Return the space-joined n-grams of the token sequence ``token``."""
        # zip over n shifted views of the sequence yields consecutive n-tuples.
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int):
        """Count the distinct n-grams shared by ``row['prompt_tokens']`` and ``row['summary_tokens']``."""
        original_ngrams = set(self.ngrams(row['prompt_tokens'], n))
        summary_ngrams = set(self.ngrams(row['summary_tokens'], n))
        return len(original_ngrams & summary_ngrams)

    def ner_overlap_count(self, row, mode:str):
        """Count named entities shared by prompt and summary, grouped by entity label.

        Args:
            row: Mapping with ``prompt_text`` and ``text``.
            mode: ``"train"`` returns the raw label -> count dict; ``"test"``
                returns one entry per key in ``self.ner_keys``.

        Returns:
            A dict of counts, or ``None`` for any other ``mode``.

        Raises:
            Exception: If the loaded model is neither spaCy nor stanza.

        NOTE(review): ``self.ner_keys`` is only assigned by the commented-out
        block in ``run``; ``mode="test"`` raises AttributeError until that
        attribute is set.
        """
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)

        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))

        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}


    def quotes_count(self, row):
        """Count double-quoted spans of the summary that occur verbatim in the prompt text."""
        quotes_from_summary = re.findall(r'"([^"]*)"', row['text'])
        prompt_text = row['prompt_text']
        return sum(1 for quote in quotes_from_summary if quote in prompt_text)

    def spelling(self, text):
        """Return how many distinct tokens of ``text`` the spell checker reports as unknown."""
        wordlist = self._spacy_words(text)
        amount_miss = len(list(self.speller.unknown(wordlist)))

        return amount_miss


    def calculate_pos_ratios(self,text):
        """Return a dict mapping each POS tag in ``text`` to its share of all tokens."""
        pos_tags = pos_tag(nltk.word_tokenize(text))
        pos_counts = Counter(tag for word, tag in pos_tags)
        total_words = len(pos_tags)
        ratios = {tag: count / total_words for tag, count in pos_counts.items()}
        return ratios

    def calculate_sentiment_scores(self,text):
        """Return the VADER polarity scores dict for ``text``."""
        # Build the analyzer once and reuse it; the original created a new
        # analyzer for every row.
        analyzer = getattr(self, "_sentiment_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            self._sentiment_analyzer = analyzer
        return analyzer.polarity_scores(text)

    def calculate_punctuation_ratios(self,text):
        """Return a dict mapping each punctuation mark found in ``text`` to its share of all characters."""
        total_chars = len(text)
        punctuation_counts = Counter(char for char in text if char in '.,!?;:"()[]{}')
        ratios = {char: count / total_chars for char, count in punctuation_counts.items()}
        return ratios

    def calculate_keyword_density(self,row):
        """Fraction of whitespace-separated summary words that also occur in the prompt.

        Returns 0.0 for a summary without any words instead of raising
        ZeroDivisionError.
        """
        keywords = set(row['prompt_text'].split())
        text_words = row['text'].split()
        if not text_words:
            return 0.0
        keyword_count = sum(1 for word in text_words if word in keywords)
        return keyword_count / len(text_words)

    def jaccard_similarity(self, row):
        """Jaccard similarity of the NLTK word-token sets of prompt and summary.

        Each text is tokenized once; returns 0.0 when both token sets are empty.
        """
        prompt_vocab = set(word_tokenize(row['prompt_text']))
        summary_vocab = set(word_tokenize(row['text']))
        union = prompt_vocab | summary_vocab
        if not union:
            return 0.0
        return len(prompt_vocab & summary_vocab) / len(union)


    def run(self,prompts: pd.DataFrame,summaries:pd.DataFrame,mode:str) -> pd.DataFrame:
        """Compute every feature and return the merged feature table.

        Args:
            prompts: Frame with ``prompt_id`` and ``prompt_text``; length and
                token columns are added to it in place.
            summaries: Frame with ``prompt_id`` and ``text``; length, token and
                spelling columns are added to it in place.
            mode: Only referenced by the commented-out NER feature block.

        Returns:
            Summaries left-joined with prompts plus the feature columns; the
            intermediate token/dict columns are dropped.
        """

        # before merge preprocess

        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(list(self.spacy_ner_model.tokenizer(x)))
        )

        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x),
                skip_special_tokens=True
            )
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(list(self.spacy_ner_model.tokenizer(x)))
        )

        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x),
                skip_special_tokens=True
            )

        )
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)

        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']

        #stop words overlap
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)


        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1
        )
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )

#         # Crate dataframe with count of each category NERs overlap for all the summaries
#         # Because it spends too much time for this feature, I don't use this time.
#         ners_count_df  = input_df.progress_apply(
#             lambda row: pd.Series(self.ner_overlap_count(row, mode=mode), dtype='float64'), axis=1
#         ).fillna(0)
#         self.ner_keys = ners_count_df.columns
#         ners_count_df['sum'] = ners_count_df.sum(axis=1)
#         ners_count_df.columns = ['NER_' + col for col in ners_count_df.columns]
#         # join ner count dataframe with train dataframe
#         input_df = pd.concat([input_df, ners_count_df], axis=1)

        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)

        #Additional

        #real overlap words
        input_df['real_word_overlap_count'] = input_df.progress_apply(self.word_overlap_count_real, axis=1)

        input_df['sentence_length'] = input_df['text'].progress_apply(lambda x: len(x.split('.')))
        input_df['vocabulary_richness'] = input_df['text'].progress_apply(lambda x: len(set(x.split())))
        input_df['avg_word_length'] = input_df['text'].progress_apply(lambda x: np.mean([len(word) for word in x.split()]))
        input_df['comma_count'] = input_df['text'].progress_apply(lambda x: x.count(','))
        input_df['semicolon_count'] = input_df['text'].progress_apply(lambda x: x.count(';'))

        input_df['pos_ratios'] = input_df['text'].progress_apply(self.calculate_pos_ratios)
        input_df['pos_mean'] = input_df['pos_ratios'].progress_apply(lambda x: np.mean(list(x.values())))

        input_df['sentiment_scores'] = input_df['text'].progress_apply(self.calculate_sentiment_scores)

        # Expand the score dicts into one column per key; relies on the
        # default 0..n-1 index produced by the merge above.
        sentiment_columns = pd.DataFrame(list(input_df['sentiment_scores']))
        input_df = pd.concat([input_df, sentiment_columns], axis=1)

        input_df['exclamation_count'] = input_df['text'].progress_apply(lambda x: x.count('!'))
        input_df['question_count'] = input_df['text'].progress_apply(lambda x: x.count('?'))
        input_df['quote_count'] = input_df['text'].progress_apply(lambda x: x.count('"'))

        input_df['punctuation_ratios'] = input_df['text'].progress_apply(self.calculate_punctuation_ratios)
        input_df['punctuation_sum'] = input_df['punctuation_ratios'].progress_apply(lambda x: np.sum(list(x.values())))

        input_df['keyword_density'] = input_df.progress_apply(self.calculate_keyword_density, axis=1)

        # Score each distinct prompt once and look the result up per row:
        # prompt_text is repeated for every summary of the same prompt.
        prompt_sentiment = {
            text: self.calculate_sentiment_scores(text)
            for text in input_df['prompt_text'].unique()
        }
        input_df['sentiment_scores_prompt'] = input_df['prompt_text'].progress_apply(
            lambda text: prompt_sentiment[text]
        )

        sentiment_columns_prompt = pd.DataFrame(list(input_df['sentiment_scores_prompt']))
        sentiment_columns_prompt.columns = [col +'_prompt' for col in sentiment_columns_prompt.columns]

        input_df = pd.concat([input_df, sentiment_columns_prompt], axis=1)

        input_df['jaccard_similarity'] = input_df.progress_apply(self.jaccard_similarity, axis=1)


        ###########TEXTSTAT FEATURES#############
        # (column name, scorer) pairs, applied in this order so the column
        # layout of the result is unchanged. smog_index is left out, as before.
        textstat_features = [
            ('flesch_reading_ease', textstat.flesch_reading_ease),
            ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
            ('gunning_fog', textstat.gunning_fog),
            ('automated_readability_index', textstat.automated_readability_index),
            ('coleman_liau_index', textstat.coleman_liau_index),
            ('linsear_write_formula', textstat.linsear_write_formula),
            ('dale_chall_readability_score', textstat.dale_chall_readability_score),
            ('text_standard', lambda x: textstat.text_standard(x,float_output=True)),
            ('spache_readability', textstat.spache_readability),
            ('mcalpine_eflaw', textstat.mcalpine_eflaw),
            ('reading_time', textstat.reading_time),
            ('syllable_count', textstat.syllable_count),
            ('polysyllabcount', textstat.polysyllabcount),
            ('monosyllabcount', textstat.monosyllabcount),
        ]
        for column, scorer in textstat_features:
            input_df[column] = input_df['text'].progress_apply(lambda x, scorer=scorer: scorer(x))


        return input_df.drop(columns=["summary_tokens", "prompt_tokens","pos_ratios","sentiment_scores","punctuation_ratios","sentiment_scores_prompt"])

# Build the feature extractor once; the constructor loads the tokenizer, the spaCy model and the spell checker.
preprocessor = Preprocessor(model_name=CFG.model_name,dir_model=CFG.dir_model)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

### Train dataset creation

In [None]:
# train = preprocessor.run(prompts_train, summaries_train, mode="train")
# test = preprocessor.run(prompts_test, summaries_test, mode="test")

# Load a previously saved feature table from Drive instead of recomputing it with the calls above.
# NOTE(review): the displayed frame starts with an "Unnamed: 0" column, which suggests the CSV
# was written together with its index — confirm, and consider reading it with index_col=0.
train = pd.read_csv("/content/drive/MyDrive/CommonLit/commont-lit-train.csv")

train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,...,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,text_standard,spache_readability,mcalpine_eflaw,reading_time,syllable_count,polysyllabcount,monosyllabcount
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,2,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,9.04,8.375,7.76,8.0,4.54,22.0,4.2,93,7,40
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,55,1,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",...,4.3,13.0,6.44,8.0,5.0,38.5,2.84,56,0,48
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,275,3,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,9.92,11.2,8.32,9.0,4.95,26.8,16.69,317,14,170
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,32,3,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,10.11,4.5,11.63,5.0,3.39,11.3,1.95,37,4,18
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,236,15,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,10.43,6.625,8.24,10.0,4.32,20.2,14.98,301,21,136


### Add Augmentations if True

In [None]:
if CFG.augmentations:
    # Augmented summaries stored on Drive (back-translation, going by the file name).
    train_aug = pd.read_csv("/content/drive/MyDrive/CommonLit/train_back_translation.csv")
    # Keep only the first 11 columns, then remove the three stale feature columns.
    train_aug = train_aug.drop(columns=train_aug.columns[11:])
    train_aug = train_aug.drop(columns=['summary_length','splling_err_num','prompt_length'])
    # Strip the first two and the last two characters of every augmented text.
    train_aug['text'] = train_aug['text'].apply(lambda x: x[2:-2])
    # Flag the origin of each row so that validation can later keep original rows only.
    train_aug['augmentation'] = 'yes'
    train['augmentation'] = 'no'
    # Stack both frames; reset_index() keeps the previous index as an "index" column.
    train = pd.concat([train,train_aug],axis=0).reset_index()

### Group K-Fold

In [None]:
# Assign every row a validation fold; rows sharing a prompt_id are grouped into the same fold.
gkf = GroupKFold(n_splits=CFG.n_splits)

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    # val_index holds row positions while .loc selects by label — this assumes `train`
    # carries a default 0..n-1 index (TODO confirm).
    train.loc[val_index, "fold"] = i

### Metrics computation

In [None]:
def compute_metrics(eval_pred):
    """Compute the RMSE between predictions and labels.

    Implemented with NumPy because the ``squared=False`` argument of
    ``sklearn.metrics.mean_squared_error`` has been deprecated and removed in
    recent scikit-learn releases. For 2-D inputs the RMSE is computed per
    column and the columns are averaged, which is what the former sklearn
    call returned.

    Args:
        eval_pred: ``(predictions, labels)`` pair of array-likes of equal shape.

    Returns:
        ``{"rmse": float}``.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions, dtype=float)
    labels = np.asarray(labels, dtype=float)
    # axis=0 gives one value per target column (a scalar for 1-D input).
    per_column_rmse = np.sqrt(np.mean((labels - predictions) ** 2, axis=0))
    rmse = float(np.mean(per_column_rmse))
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Mean column-wise root mean squared error (the competition metric).
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    ``eval_pred`` unpacks to ``(preds, labels)``; column 0 is reported as the
    content error and column 1 as the wording error.
    """
    preds, labels = eval_pred

    squared_error = (preds - labels) ** 2
    col_rmse = np.sqrt(np.mean(squared_error, axis=0))

    metrics = {"content_rmse": col_rmse[0], "wording_rmse": col_rmse[1]}
    metrics["mcrmse"] = np.mean(col_rmse)
    return metrics

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Return the average of the content RMSE and the wording RMSE."""
    target_pairs = ((content_true, content_pred), (wording_true, wording_pred))
    rmse_per_target = [
        mean_squared_error(y_true, y_pred) ** 0.5
        for y_true, y_pred in target_pairs
    ]
    return sum(rmse_per_target) / 2

### Loss definition

In [None]:
def mcrmse_loss(y_true, y_pred):
    """Mean column-wise RMSE between two tensors, reduced over dim 0."""
    squared_error = (y_true - y_pred) ** 2
    per_column_rmse = squared_error.mean(dim=0).sqrt()
    return per_column_rmse.mean(dim=0)

# def mse_loss()
#     pass

class CustomTrainer(Trainer):
    """Trainer that optimises ``mcrmse_loss`` on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on a batch and return its MCRMSE loss.

        Args:
            model: Model to evaluate.
            inputs: Batch dict; ``labels`` is removed from it before the
                forward pass.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Accepted and ignored so that extra arguments passed by
                newer ``transformers`` releases (e.g. ``num_items_in_batch``)
                do not raise a TypeError.

        Raises:
            ValueError: If the batch carries no ``labels``.
        """
        labels = inputs.pop("labels", None)
        if labels is None:
            # Fail with a clear message instead of the TypeError that
            # subtracting a tensor from None would raise in mcrmse_loss.
            raise ValueError("CustomTrainer.compute_loss requires 'labels' in the batch")

        outputs = model(**inputs)
        loss = mcrmse_loss(labels, outputs['logits'])

        return (loss, outputs) if return_outputs else loss

### Regressor

In [None]:
class ScoreRegressor:
    """Fine-tune and run a two-output (content, wording) regression model.

    The selected text columns of a row are joined with the tokenizer's
    separator token, tokenized, and fed to an
    ``AutoModelForSequenceClassification`` configured for regression with
    two labels.
    """

    def __init__(self,model_name: str,dir_model:str, model_dir: str,inputs: List[str],target_cols: List[str],hidden_dropout_prob: float,
                attention_probs_dropout_prob: float, max_length: int,):
        """Load tokenizer and config and store the run settings.

        Args:
            model_name: Model identifier appended to ``dir_model``.
            dir_model: Prefix (hub organisation or local directory) of the model.
            model_dir: Directory under which ``train`` writes its checkpoints.
            inputs: Text columns concatenated into the model input.
            target_cols: Target columns kept alongside the model input.
            hidden_dropout_prob: Dropout written into the model config.
            attention_probs_dropout_prob: Attention dropout written into the config.
            max_length: Token length every example is padded/truncated to.
        """

        self.input_col = "input" # col name of model input after text concat sep token
        self.input_text_cols = inputs
        self.target_cols = target_cols
        self.model_name = model_name
        self.dir_model = dir_model
        self.model_dir = model_dir
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(f"{dir_model}{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"{dir_model}{model_name}")

        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 2,
            "problem_type": "regression",
        })

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )

    def concatenate_with_sep_token(self, row):
        """Join the configured text columns of ``row`` with `` <sep_token> ``."""
        sep = " " + self.tokenizer.sep_token + " "
        return sep.join(row[self.input_text_cols])

    def tokenize_function(self, examples: pd.DataFrame):
        """Tokenize one training example and attach its ``[content, wording]`` labels."""
        # The label order is fixed here and matches the prediction column
        # names used in ``predict``; it does not follow ``self.target_cols``.
        labels = [examples["content"], examples["wording"]]
        tokenized = self.tokenizer(examples[self.input_col],
                        padding="max_length",
                        truncation=True,
                        max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }

    def tokenize_function_test(self, examples: pd.DataFrame):
        """Tokenize one inference example (no labels)."""
        tokenized = self.tokenizer(examples[self.input_col],
                        padding="max_length",
                        truncation=True,
                        max_length=self.max_length)
        return tokenized

    @staticmethod
    def _resolve_checkpoint_dir(fold_dir: str) -> str:
        """Pick the checkpoint directory to load from ``fold_dir``.

        Prefers the checkpoint recorded as ``best_model_checkpoint`` in the
        ``trainer_state.json`` of the highest-step ``checkpoint-*`` directory,
        then that highest-step checkpoint, then the first entry in sorted
        order. The original took ``os.listdir(...)[0]``, whose order is
        arbitrary and which may be the last rather than the best checkpoint
        when two are retained.

        Raises:
            FileNotFoundError: If ``fold_dir`` contains no entries.
        """
        entries = sorted(os.listdir(fold_dir))
        if not entries:
            raise FileNotFoundError(f"no checkpoint found in {fold_dir}")

        checkpoints = [
            name for name in entries
            if name.startswith("checkpoint-") and os.path.isdir(os.path.join(fold_dir, name))
        ]
        if not checkpoints:
            return os.path.join(fold_dir, entries[0])

        def step_of(name):
            suffix = name.rsplit("-", 1)[-1]
            return int(suffix) if suffix.isdigit() else -1

        checkpoints.sort(key=step_of)
        latest = os.path.join(fold_dir, checkpoints[-1])

        state_file = os.path.join(latest, "trainer_state.json")
        if os.path.isfile(state_file):
            with open(state_file, encoding="utf-8") as fh:
                best = json.load(fh).get("best_model_checkpoint")
            if best:
                # Only the directory name is used so that a relocated run
                # directory still resolves.
                candidate = os.path.join(fold_dir, os.path.basename(os.path.normpath(best)))
                if os.path.isdir(candidate):
                    return candidate
        return latest

    def train(self,fold: int,train_df: pd.DataFrame,valid_df: pd.DataFrame,batch_size: int,learning_rate: float,
            weight_decay: float,num_train_epochs: float,save_steps: int,) -> None:
        """Fine-tune the model on ``train_df`` and evaluate on ``valid_df``.

        Checkpoints are written to ``<model_dir>/<fold>``; evaluation and
        saving happen every ``save_steps`` steps and the checkpoint with the
        lowest ``mcrmse`` is restored at the end.
        """

        # assign() returns new frames, so the callers' frames (often slices
        # of a larger frame) are not modified.
        train_df = train_df.assign(**{self.input_col: train_df.apply(self.concatenate_with_sep_token, axis=1)})
        valid_df = valid_df.assign(**{self.input_col: valid_df.apply(self.concatenate_with_sep_token, axis=1)})

        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]

        model = AutoModelForSequenceClassification.from_pretrained(
            f"{self.dir_model}{self.model_name}",
            config=self.model_config
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False)

        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # eg. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold))

        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            overwrite_output_dir=True,
            do_train=True,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="mcrmse",
            save_total_limit=1,
            fp16=True,
            auto_find_batch_size=True,
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_mcrmse,
            data_collator=self.data_collator,
            callbacks = [EarlyStoppingCallback(early_stopping_patience=CFG.early_stopping_patience)]
        )

        trainer.train()

        # model.save_pretrained(self.model_dir)
        # self.tokenizer.save_pretrained(self.model_dir)

        # Release GPU memory before the next fold.
        model.cpu()
        del model
        gc.collect()
        torch.cuda.empty_cache()


    def predict(self,test_df: pd.DataFrame,batch_size: int,fold: int,):
        """Predict content and wording scores for ``test_df`` with the fold's checkpoint.

        Returns:
            DataFrame with the columns ``content_pred`` and ``wording_pred``,
            one row per row of ``test_df``, in the same order.
        """

        test_df = test_df.assign(**{self.input_col: test_df.apply(self.concatenate_with_sep_token, axis=1)})

        test_dataset = Dataset.from_pandas(test_df[[self.input_col]], preserve_index=False)
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        # The checkpoint location is derived from CFG.save_model_path (not
        # from self.model_dir), mirroring the layout written by train().
        fold_dir = CFG.save_model_path+f"/fold_{str(fold)}/{str(fold)}"
        model_fold_dir = self._resolve_checkpoint_dir(fold_dir)

        model = AutoModelForSequenceClassification.from_pretrained(model_fold_dir)
        model.eval()

        # e.g. "bert/fold_0/"

        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train=False,
            do_predict=True,
            per_device_eval_batch_size=batch_size,
            dataloader_drop_last=False,
            fp16=True,
            auto_find_batch_size=True,
        )

        # init trainer
        infer_content = CustomTrainer(
                      model = model,
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]
        pred_df = pd.DataFrame(
            preds,
            columns=[
                "content_pred",
                "wording_pred"
           ]
        )

        model.cpu()
        del model
        gc.collect()
        torch.cuda.empty_cache()

        return pred_df

In [None]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        dir_model:str,
        targets: List[str],
        inputs: List[str],
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: float,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length:int
    ):
    """Fine-tune one model per fold.

    For fold ``k`` the rows with ``fold != k`` are used for training and the
    rows with ``fold == k`` for validation (original rows only when
    ``CFG.augmentations`` is enabled). Checkpoints are written below
    ``CFG.save_model_path/fold_<k>``.

    Args:
        train_df: Training table with a ``fold`` column.
        model_name, dir_model: Model identifier and its prefix.
        targets: Target columns handed to ``ScoreRegressor``.
        inputs: Text columns concatenated into the model input.
        save_each_model: Accepted for interface compatibility; not used.
        n_splits: Number of folds to train.
        batch_size, learning_rate, weight_decay, num_train_epochs, save_steps:
            Forwarded to ``ScoreRegressor.train``.
        hidden_dropout_prob, attention_probs_dropout_prob, max_length:
            Forwarded to the ``ScoreRegressor`` constructor.
    """

    # delete old model files
    # if os.path.exists(model_name):
    #     shutil.rmtree(model_name)

    # exist_ok keeps the cell re-runnable; a bare makedirs raises
    # FileExistsError on the second run. Checkpoints of an earlier run with
    # the same EXP_NUM are not removed here.
    os.makedirs(CFG.save_model_path, exist_ok=True)

    for fold in range(n_splits):
        print(f"fold {fold}:")

        # Copies, because the regressor adds its input column to the frames
        # it receives and these would otherwise be slices of train_df.
        train_data = train_df[train_df["fold"] != fold].copy()
        valid_data = train_df[train_df["fold"] == fold]
        if CFG.augmentations:
            # Validate on original rows only.
            valid_data = valid_data[valid_data['augmentation']=='no']
        valid_data = valid_data.copy()

        model_dir =  CFG.save_model_path + f"/fold_{fold}"

        csr = ScoreRegressor(
            model_name=model_name,
            dir_model=dir_model,
            target_cols=targets,
            inputs= inputs,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )

        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data,
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

def validate(
    train_df: pd.DataFrame,
    mode: str,
    targets: List[str],
    inputs: List[str],
    save_each_model: bool,
    n_splits: int,
    batch_size: int,
    model_name: str,
    dir_model:str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """Fill in out-of-fold predictions for every fold of ``train_df``.

    For each fold index a ``ScoreRegressor`` is built and its ``predict`` is
    called on the rows whose ``fold`` equals that index. The returned
    ``content_pred`` / ``wording_pred`` values are written into ``train_df``
    (in place) under ``content_{mode}_pred`` / ``wording_{mode}_pred``.

    ``save_each_model`` is accepted but not used in this function.

    Returns:
        ``train_df`` restricted to its original columns followed by the two
        prediction columns.
    """
    original_columns = list(train_df.columns.values)
    content_col = f"content_{mode}_pred"
    wording_col = f"wording_{mode}_pred"

    for fold in range(n_splits):
        print(f"fold {fold}:")

        fold_rows = train_df[train_df["fold"] == fold]

        # NOTE(review): train_by_fold passes CFG.save_model_path + "/fold_<k>"
        # as model_dir, whereas this uses model_name as the parent directory —
        # confirm which path ScoreRegressor.predict actually loads from.
        regressor = ScoreRegressor(
            model_name=model_name,
            dir_model=dir_model,
            target_cols=targets,
            inputs=inputs,
            model_dir=f"{model_name}/fold_{fold}",
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        fold_preds = regressor.predict(
            test_df=fold_rows,
            batch_size=batch_size,
            fold=fold,
        )

        # .values drops fold_preds' own index so rows align positionally
        # with fold_rows.
        train_df.loc[fold_rows.index, content_col] = fold_preds[f"content_pred"].values
        train_df.loc[fold_rows.index, wording_col] = fold_preds[f"wording_pred"].values

    return train_df[original_columns + [content_col, wording_col]]

### Train and Validation

In [None]:
# NOTE(review): targets is ordered wording-first here, while predict() names
# its output columns content_pred, wording_pred — confirm ScoreRegressor does
# not rely on the order of target_cols.
targets = ["wording", "content"]
mode = "multi"
input_cols = ["prompt_title", "prompt_question", "text"]
model_cfg = CFG

# Fine-tune one model per fold.
train_by_fold(
    train,
    model_name=model_cfg.model_name,
    dir_model=model_cfg.dir_model,
    save_each_model=False,
    targets=targets,
    inputs=input_cols,
    learning_rate=model_cfg.learning_rate,
    hidden_dropout_prob=model_cfg.hidden_dropout_prob,
    attention_probs_dropout_prob=model_cfg.attention_probs_dropout_prob,
    weight_decay=model_cfg.weight_decay,
    num_train_epochs=model_cfg.num_train_epochs,
    n_splits=CFG.n_splits,
    batch_size=model_cfg.batch_size,
    save_steps=model_cfg.save_steps,
    max_length=model_cfg.max_length
)

# Out-of-fold predictions are only produced for the original rows, so the
# augmented rows and the marker column are dropped before validating.
if CFG.augmentations:
    train = train[train['augmentation']=='no']
    train = train.drop('augmentation',axis=1)

train = validate(
    train,
    mode=mode,
    targets=targets,
    inputs=input_cols,
    save_each_model=False,
    n_splits=CFG.n_splits,
    batch_size=model_cfg.batch_size,
    model_name=model_cfg.model_name,
    dir_model=model_cfg.dir_model,
    hidden_dropout_prob=model_cfg.hidden_dropout_prob,
    attention_probs_dropout_prob=model_cfg.attention_probs_dropout_prob,
    max_length=model_cfg.max_length
)

# set validate result
for target in ["content", "wording"]:
    # np.sqrt(MSE) instead of squared=False: the `squared` argument is
    # deprecated/removed in newer scikit-learn, and this matches how the
    # "CV Score" cell computes RMSE.
    rmse = np.sqrt(mean_squared_error(train[target], train[f"{target}_{mode}_pred"]))
    print(f"cv {target} rmse: {rmse}")

fold 0:


Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

deberta.embeddings.word_embeddings.weight False
deberta.embeddings.LayerNorm.weight False
deberta.embeddings.LayerNorm.bias False
deberta.encoder.layer.0.attention.self.query_proj.weight False
deberta.encoder.layer.0.attention.self.query_proj.bias False
deberta.encoder.layer.0.attention.self.key_proj.weight False
deberta.encoder.layer.0.attention.self.key_proj.bias False
deberta.encoder.layer.0.attention.self.value_proj.weight False
deberta.encoder.layer.0.attention.self.value_proj.bias False
deberta.encoder.layer.0.attention.output.dense.weight False
deberta.encoder.layer.0.attention.output.dense.bias False
deberta.encoder.layer.0.attention.output.LayerNorm.weight False
deberta.encoder.layer.0.attention.output.LayerNorm.bias False
deberta.encoder.layer.0.intermediate.dense.weight False
deberta.encoder.layer.0.intermediate.dense.bias False
deberta.encoder.layer.0.output.dense.weight False
deberta.encoder.layer.0.output.dense.bias False
deberta.encoder.layer.0.output.LayerNorm.weight Fa

Step,Training Loss,Validation Loss,Content Rmse,Wording Rmse,Mcrmse
100,No log,0.759219,0.787772,0.762141,0.774956
200,No log,0.666016,0.632377,0.729749,0.681063
300,No log,0.569413,0.575235,0.598637,0.586936
400,No log,0.521616,0.500598,0.572558,0.536578
500,0.607600,0.457823,0.410337,0.532622,0.47148
600,0.607600,0.488606,0.414557,0.590692,0.502625
700,0.607600,0.500115,0.509321,0.521387,0.515354
800,0.607600,0.459572,0.421155,0.524301,0.472728
900,0.607600,0.520324,0.445163,0.620327,0.532745
1000,0.494900,0.496489,0.423983,0.597824,0.510903


fold 1:
deberta.embeddings.word_embeddings.weight False
deberta.embeddings.LayerNorm.weight False
deberta.embeddings.LayerNorm.bias False
deberta.encoder.layer.0.attention.self.query_proj.weight False
deberta.encoder.layer.0.attention.self.query_proj.bias False
deberta.encoder.layer.0.attention.self.key_proj.weight False
deberta.encoder.layer.0.attention.self.key_proj.bias False
deberta.encoder.layer.0.attention.self.value_proj.weight False
deberta.encoder.layer.0.attention.self.value_proj.bias False
deberta.encoder.layer.0.attention.output.dense.weight False
deberta.encoder.layer.0.attention.output.dense.bias False
deberta.encoder.layer.0.attention.output.LayerNorm.weight False
deberta.encoder.layer.0.attention.output.LayerNorm.bias False
deberta.encoder.layer.0.intermediate.dense.weight False
deberta.encoder.layer.0.intermediate.dense.bias False
deberta.encoder.layer.0.output.dense.weight False
deberta.encoder.layer.0.output.dense.bias False
deberta.encoder.layer.0.output.LayerNorm.w

Step,Training Loss,Validation Loss,Content Rmse,Wording Rmse,Mcrmse
100,No log,0.837719,0.831307,0.880622,0.855965
200,No log,0.944562,0.893945,1.037032,0.965488
300,No log,0.880874,0.792384,1.02977,0.911077
400,No log,0.668143,0.616966,0.766841,0.691903
500,0.580200,0.891393,0.832154,1.00845,0.920302
600,0.580200,0.708584,0.551116,0.935249,0.743183
700,0.580200,0.680756,0.541622,0.888972,0.715297
800,0.580200,0.671859,0.566346,0.835363,0.700854
900,0.580200,0.81151,0.689029,1.001501,0.845265
1000,0.489500,0.739923,0.65322,0.889739,0.77148


fold 2:
deberta.embeddings.word_embeddings.weight False
deberta.embeddings.LayerNorm.weight False
deberta.embeddings.LayerNorm.bias False
deberta.encoder.layer.0.attention.self.query_proj.weight False
deberta.encoder.layer.0.attention.self.query_proj.bias False
deberta.encoder.layer.0.attention.self.key_proj.weight False
deberta.encoder.layer.0.attention.self.key_proj.bias False
deberta.encoder.layer.0.attention.self.value_proj.weight False
deberta.encoder.layer.0.attention.self.value_proj.bias False
deberta.encoder.layer.0.attention.output.dense.weight False
deberta.encoder.layer.0.attention.output.dense.bias False
deberta.encoder.layer.0.attention.output.LayerNorm.weight False
deberta.encoder.layer.0.attention.output.LayerNorm.bias False
deberta.encoder.layer.0.intermediate.dense.weight False
deberta.encoder.layer.0.intermediate.dense.bias False
deberta.encoder.layer.0.output.dense.weight False
deberta.encoder.layer.0.output.dense.bias False
deberta.encoder.layer.0.output.LayerNorm.w

Step,Training Loss,Validation Loss,Content Rmse,Wording Rmse,Mcrmse
100,No log,0.582203,0.580814,0.617328,0.599071
200,No log,0.60662,0.494394,0.751267,0.622831
300,No log,0.602472,0.595484,0.63575,0.615617
400,No log,0.499979,0.496317,0.536321,0.516319
500,0.591900,0.479097,0.430395,0.556901,0.493648
600,0.591900,0.467174,0.443633,0.520188,0.48191
700,0.591900,0.494703,0.440496,0.579908,0.510202
800,0.591900,0.567375,0.631065,0.532054,0.58156
900,0.591900,0.569502,0.536409,0.635999,0.586204
1000,0.494500,0.475091,0.456596,0.520141,0.488369


fold 3:
deberta.embeddings.word_embeddings.weight False
deberta.embeddings.LayerNorm.weight False
deberta.embeddings.LayerNorm.bias False
deberta.encoder.layer.0.attention.self.query_proj.weight False
deberta.encoder.layer.0.attention.self.query_proj.bias False
deberta.encoder.layer.0.attention.self.key_proj.weight False
deberta.encoder.layer.0.attention.self.key_proj.bias False
deberta.encoder.layer.0.attention.self.value_proj.weight False
deberta.encoder.layer.0.attention.self.value_proj.bias False
deberta.encoder.layer.0.attention.output.dense.weight False
deberta.encoder.layer.0.attention.output.dense.bias False
deberta.encoder.layer.0.attention.output.LayerNorm.weight False
deberta.encoder.layer.0.attention.output.LayerNorm.bias False
deberta.encoder.layer.0.intermediate.dense.weight False
deberta.encoder.layer.0.intermediate.dense.bias False
deberta.encoder.layer.0.output.dense.weight False
deberta.encoder.layer.0.output.dense.bias False
deberta.encoder.layer.0.output.LayerNorm.w

Step,Training Loss,Validation Loss,Content Rmse,Wording Rmse,Mcrmse
100,No log,0.804601,0.676987,1.001172,0.83908
200,No log,0.793156,0.606983,1.026703,0.816843
300,No log,0.735604,0.579552,0.935168,0.75736
400,No log,0.780764,0.683488,0.93691,0.810199
500,0.582400,0.608297,0.554995,0.725414,0.640204
600,0.582400,0.63787,0.619111,0.726524,0.672818
700,0.582400,0.857584,0.798801,0.961055,0.879928
800,0.582400,0.687353,0.649398,0.774096,0.711747
900,0.582400,0.693795,0.6305,0.804809,0.717655
1000,0.501700,0.572538,0.486484,0.710456,0.59847


fold 0:


fold 1:


fold 2:


fold 3:


cv content rmse: 0.46173521257928285
cv wording rmse: 0.6468330711904262


### Results

In [None]:
# Write the frame, including the out-of-fold prediction columns, into the
# experiment's model directory, then preview the first rows.
train.to_csv(
    f"{CFG.save_model_path}/train-predictions_{EXP_NUM}.csv",
    index=False,
)
train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,...,text_standard,spache_readability,mcalpine_eflaw,reading_time,syllable_count,polysyllabcount,monosyllabcount,fold,content_multi_pred,wording_multi_pred
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,2,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,8.0,4.54,22.0,4.2,93,7,40,3.0,0.060547,0.817871
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,55,1,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",...,8.0,5.0,38.5,2.84,56,0,48,2.0,-0.770508,-0.442871
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,275,3,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,9.0,4.95,26.8,16.69,317,14,170,1.0,2.558594,2.748047
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,32,3,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,...,5.0,3.39,11.3,1.95,37,4,18,1.0,-0.699219,-0.434326
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,236,15,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,...,10.0,4.32,20.2,14.98,301,21,136,3.0,1.975586,2.103516


In [None]:
# Create the per-model output folder on the mounted Drive if it is missing.
os.makedirs(
    f"/content/drive/MyDrive/CommonLit/{CFG.model_name}",
    exist_ok=True,
)

In [None]:
# Copy of the predictions CSV in the per-model Drive folder.
train.to_csv(
    f"/content/drive/MyDrive/CommonLit/{CFG.model_name}/train-predictions_{EXP_NUM}.csv",
    index=False,
)

In [None]:
# Open the per-experiment results file on Drive. The handle `f` is deliberately
# left open here: the "CV Score" cell further down writes the LGBM metrics to
# it and then closes it.
f = open(f"/content/drive/MyDrive/CommonLit/{CFG.model_name}/results_{EXP_NUM}.txt", "w")

for target in ["content", "wording"]:
    # np.sqrt(MSE) rather than squared=False, which newer scikit-learn removed.
    rmse = np.sqrt(mean_squared_error(train[target], train[f"{target}_{mode}_pred"]))
    # Trailing newline so each metric lands on its own line in the file.
    f.write(f"cv {target} rmse: {rmse}\n")

### Save model

In [None]:
# Recursively zip /content/deberta-v3-base-model into the Drive folder.
# NOTE(review): the model name and experiment number are hard-coded here,
# unlike the neighbouring cells that use CFG.model_name / EXP_NUM, and the
# recorded output lists bert-large-cased-model/exp_1 paths — confirm both
# paths match the current run before executing.
!zip -r /content/drive/MyDrive/CommonLit/deberta-v3-base/exp_3.zip  /content/deberta-v3-base-model

  adding: content/bert-large-cased-model/ (stored 0%)
  adding: content/bert-large-cased-model/exp_1/ (stored 0%)
  adding: content/bert-large-cased-model/exp_1/fold_3/ (stored 0%)
  adding: content/bert-large-cased-model/exp_1/fold_3/3/ (stored 0%)
  adding: content/bert-large-cased-model/exp_1/fold_3/3/checkpoint-1800/ (stored 0%)
  adding: content/bert-large-cased-model/exp_1/fold_3/3/checkpoint-1800/trainer_state.json (deflated 76%)
  adding: content/bert-large-cased-model/exp_1/fold_3/3/checkpoint-1800/config.json (deflated 52%)
  adding: content/bert-large-cased-model/exp_1/fold_3/3/checkpoint-1800/rng_state.pth (deflated 28%)
  adding: content/bert-large-cased-model/exp_1/fold_3/3/checkpoint-1800/pytorch_model.bin (deflated 7%)
  adding: content/bert-large-cased-model/exp_1/fold_3/3/checkpoint-1800/vocab.txt (deflated 49%)
  adding: content/bert-large-cased-model/exp_1/fold_3/3/checkpoint-1800/training_args.bin (deflated 48%)
  adding: content/bert-large-cased-model/exp_1/fold_3

### LGBM Model

In [None]:
# Regression targets; one set of LGBM models is trained per entry.
targets = ["content", "wording"]

# Columns removed from `train` before it is handed to LightGBM.
# NOTE(review): the train.head() output also shows an "Unnamed: 0" column that
# is not listed here — confirm whether it is meant to be a feature.
drop_columns = [
    # fold assignment, identifiers and raw text
    "fold",
    "student_id",
    "prompt_id",
    "text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    "prompt_length",
    # feature columns excluded from the LGBM input
    "avg_word_length",
    "semicolon_count",
    "neg",
    "neu",
    "pos",
    "compound",
    "exclamation_count",
    "question_count",
    "punctuation_sum",
    "neg_prompt",
    "neu_prompt",
    "pos_prompt",
    "compound_prompt",
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "gunning_fog",
    "automated_readability_index",
    "coleman_liau_index",
    "linsear_write_formula",
    "dale_chall_readability_score",
    "text_standard",
    "spache_readability",
    "mcalpine_eflaw",
]
# The targets themselves must never be visible to the model as inputs.
drop_columns = drop_columns + targets

In [None]:
# Maps each target name to its list of per-fold boosters (index == fold).
model_dict = {}

# Hyper-parameters are identical for every target and fold, so build them once.
params = {
    'boosting_type': 'gbdt',
    'random_state': 42,
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.048,
    'lambda_l1': 0.0,
    'lambda_l2': 0.011
}

for target in targets:
    models = []

    for fold in range(CFG.n_splits):
        train_mask = train["fold"] != fold
        eval_mask = train["fold"] == fold

        X_train_cv = train[train_mask].drop(columns=drop_columns)
        y_train_cv = train[train_mask][target]

        X_eval_cv = train[eval_mask].drop(columns=drop_columns)
        y_eval_cv = train[eval_mask][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        evaluation_results = {}
        model = lgb.train(
            params,
            train_set=dtrain,
            num_boost_round=10000,
            # Two names need two datasets: previously only dval was passed, so
            # the held-out metric was logged under the name "train".
            # Early stopping ignores the training set, so the stopping point
            # is still decided by "valid" alone.
            valid_sets=[dtrain, dval],
            valid_names=['train', 'valid'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=30, verbose=True),
                lgb.log_evaluation(100),
                lgb.callback.record_evaluation(evaluation_results),
            ],
        )
        models.append(model)

    model_dict[target] = models

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3004
[LightGBM] [Info] Number of data points in the train set: 5108, number of used features: 21
[LightGBM] [Info] Start training from score 0.017606
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.412134
Early stopping, best iteration is:
[70]	train's rmse: 0.410412
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2926
[LightGBM] [Info] Number of data points in the train set: 5156, number of used features: 21
[LightGBM] [Info] Start training from score -0.039959
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[42]	train's rmse: 0.529247
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2927
[LightGBM] [Info] Number of data points in the train set: 5169, number of used features: 21
[LightGBM] [Info] Start training from score 0.013356
Training un

### CV Score

In [None]:
# cv
rmses = []

# `f` is the results file opened in the "Results" cell. Using it as a context
# manager guarantees it is closed even if scoring raises part-way through.
with f:
    for target in targets:
        models = model_dict[target]

        preds = []
        trues = []

        for fold, model in enumerate(models):
            # Score each fold's model on the rows of its own held-out fold.
            eval_rows = train[train["fold"] == fold]
            X_eval_cv = eval_rows.drop(columns=drop_columns)
            y_eval_cv = eval_rows[target]

            pred = model.predict(X_eval_cv)

            trues.extend(y_eval_cv)
            preds.extend(pred)

        # RMSE over the concatenated out-of-fold predictions for this target.
        rmse = np.sqrt(mean_squared_error(trues, preds))
        print(f"{target}_rmse : {rmse}")
        # Trailing newline so each metric lands on its own line in the file.
        f.write(f"{target}_rmse : {rmse}\n")
        rmses.append(rmse)

    # Mean of the per-target RMSEs.
    mcrmse = sum(rmses) / len(rmses)
    print(f"mcrmse : {mcrmse}")
    f.write(f"mcrmse : {mcrmse}\n")

content_rmse : 0.4804265147960582
wording_rmse : 0.6033187479833971
mcrmse : 0.5418726313897276
