### Installs

In [1]:
# Install pyspellchecker from a local wheel under /kaggle/input.
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl" #pyspell

import sys
# Put the local textstat package directory on the import path before importing it.
sys.path.append("/kaggle/input/textstat-install-mit/package") ##Textstat
import textstat

Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


### Imports

In [2]:
from typing import List
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import logging
import os
import gc
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from collections import Counter
import spacy
import re
from spellchecker import SpellChecker
import lightgbm as lgb



import nltk
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import ne_chunk, word_tokenize, pos_tag
# logging setting 

# Silence Python warnings and every log record at ERROR level or below.
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
# Environment switches read by the tokenizers library and by TensorFlow.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
# Turn off the progress bars of the `datasets` library.
disable_progress_bar()
# Register `progress_apply` on pandas objects (used throughout Preprocessor).
tqdm.pandas()



### Seed to 42

In [3]:
def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Value passed to every random number generator.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this call is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic flag above, so keep it off.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

### Device

In [4]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

### Read data

In [5]:
# Directory holding the competition CSV files. It already ends with a slash,
# so file names are appended to it directly.
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

# Prompts and student summaries for the train and test splits.
prompts_train = pd.read_csv(f"{DATA_DIR}prompts_train.csv")
prompts_test = pd.read_csv(f"{DATA_DIR}prompts_test.csv")
summaries_train = pd.read_csv(f"{DATA_DIR}summaries_train.csv")
summaries_test = pd.read_csv(f"{DATA_DIR}summaries_test.csv")

# Submission template.
sample_submission = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

### Pre-processor

In [6]:
class Preprocessor:
    """Builds hand-crafted text features for (prompt, summary) pairs.

    The public entry point is :meth:`run`, which merges summaries with their
    prompts and appends overlap, punctuation, sentiment, POS and readability
    features. Column names and column order are kept stable because the
    downstream LightGBM models consume the frame as-is.
    """

    # textstat functions applied to the summary text, in output-column order.
    # 'text_standard' is called with float_output=True (see run()).
    _TEXTSTAT_FEATURES = (
        'flesch_reading_ease',
        'flesch_kincaid_grade',
        'gunning_fog',
        'automated_readability_index',
        'coleman_liau_index',
        'linsear_write_formula',
        'dale_chall_readability_score',
        'text_standard',
        'spache_readability',
        'mcalpine_eflaw',
        'reading_time',
        'syllable_count',
        'polysyllabcount',
        'monosyllabcount',
    )

    def __init__(self, 
                model_name: str,
                ) -> None:
        """Load the tokenizer, stop-word list, spaCy pipeline and spell checker.

        Args:
            model_name: Directory name under /kaggle/input holding the tokenizer.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        self.STOP_WORDS = set(stopwords.words('english'))

        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = SpellChecker() #Speller(lang='en')
        # Created lazily on first use by calculate_sentiment_scores().
        self._sentiment_analyzer = None

    # ------------------------------------------------------------------ helpers
    def _spacy_tokens(self, text):
        """Return the spaCy tokenizer output for `text` as a list of strings."""
        return [str(token) for token in self.spacy_ner_model.tokenizer(text)]

    def _model_tokens(self, text):
        """Return the transformer tokenizer's tokens for `text`, without special tokens."""
        return self.tokenizer.convert_ids_to_tokens(
            self.tokenizer.encode(text),
            skip_special_tokens=True
        )

    def _token_overlap(self, row, keep_stop_words: bool):
        """Count distinct spaCy tokens shared by row['prompt_text'] and row['text'].

        When a stop-word list is available, tokens are first restricted to stop
        words (keep_stop_words=True) or to non-stop words (keep_stop_words=False).
        """
        prompt_words = self._spacy_tokens(row['prompt_text'])
        summary_words = self._spacy_tokens(row['text'])

        if self.STOP_WORDS:
            prompt_words = [w for w in prompt_words if (w in self.STOP_WORDS) == keep_stop_words]
            summary_words = [w for w in summary_words if (w in self.STOP_WORDS) == keep_stop_words]
        return len(set(prompt_words) & set(summary_words))

    # ----------------------------------------------------------------- features
    def count_text_length(self, df: pd.DataFrame, col:str) -> pd.Series:
        """Number of transformer-tokenizer ids per text in df[col]."""
        tokenizer = self.tokenizer
        return df[col].progress_apply(lambda x: len(tokenizer.encode(x)))

    #JUST STOP WORDS OVERLAP
    def word_overlap_count(self, row):
        """Number of distinct stop words present in both prompt and summary."""
        return self._token_overlap(row, keep_stop_words=True)

    #REAL OVERLAP WORD
    def word_overlap_count_real(self, row):
        """Number of distinct non-stop-word tokens present in both prompt and summary."""
        return self._token_overlap(row, keep_stop_words=False)

    def ngrams(self, token, n):
        """Return the space-joined n-grams of the token sequence `token`."""
        # zip over n shifted views of the sequence yields consecutive n-tuples
        shifted = [token[i:] for i in range(n)]
        return [" ".join(gram) for gram in zip(*shifted)]

    def ngram_co_occurrence(self, row, n: int):
        """Number of distinct n-grams shared by row['prompt_tokens'] and row['summary_tokens']."""
        original_ngrams = set(self.ngrams(row['prompt_tokens'], n))
        summary_ngrams = set(self.ngrams(row['summary_tokens'], n))
        return len(original_ngrams & summary_ngrams)

    def ner_overlap_count(self, row, mode:str):
        """Count named entities shared by prompt and summary, per entity label.

        Not called by run() at the moment. In "train" mode the raw label->count
        dict is returned; in "test" mode the dict is projected onto
        `self.ner_keys`, which this class never sets itself, so the caller has
        to assign it first. Any other mode returns None.
        """
        model = self.spacy_ner_model

        def clean_ners(ner_list):
            # lower-case the entity text so matching is case-insensitive
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])

        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)

        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))

        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    def quotes_count(self, row):
        """Number of double-quoted spans in the summary that occur verbatim in the prompt text."""
        prompt_text = row['prompt_text']
        quoted_spans = re.findall(r'"([^"]*)"', row['text'])
        return sum(1 for quote in quoted_spans if quote in prompt_text)

    def spelling(self, text):
        """Number of distinct spaCy tokens the spell checker does not know."""
        return len(self.speller.unknown(self._spacy_tokens(text)))

    def calculate_pos_ratios(self,text):
        """Map each NLTK POS tag found in `text` to its share of all tagged tokens."""
        pos_tags = pos_tag(nltk.word_tokenize(text))
        pos_counts = Counter(tag for word, tag in pos_tags)
        total_words = len(pos_tags)
        # pos_counts is empty when there are no tokens, so no division happens then
        return {tag: count / total_words for tag, count in pos_counts.items()}

    def calculate_sentiment_scores(self,text):
        """Return the NLTK SentimentIntensityAnalyzer polarity scores for `text`."""
        # Build the analyzer once and reuse it instead of constructing one per row.
        analyzer = getattr(self, "_sentiment_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            self._sentiment_analyzer = analyzer
        return analyzer.polarity_scores(text)

    def calculate_punctuation_ratios(self,text):
        """Map each punctuation character found in `text` to its share of all characters."""
        total_chars = len(text)
        punctuation_counts = Counter(char for char in text if char in '.,!?;:"()[]{}')
        return {char: count / total_chars for char, count in punctuation_counts.items()}

    def calculate_keyword_density(self,row):
        """Fraction of whitespace-separated summary words that also occur in the prompt text.

        Returns 0.0 for an empty or whitespace-only summary instead of dividing by zero.
        """
        keywords = set(row['prompt_text'].split())
        text_words = row['text'].split()
        if not text_words:
            return 0.0
        keyword_count = sum(1 for word in text_words if word in keywords)
        return keyword_count / len(text_words)

    def jaccard_similarity(self, row):
        """Jaccard similarity of the NLTK token sets of prompt text and summary.

        Each text is tokenized once; 0.0 is returned when both token sets are empty.
        """
        prompt_vocab = set(word_tokenize(row['prompt_text']))
        summary_vocab = set(word_tokenize(row['text']))
        union = prompt_vocab | summary_vocab
        if not union:
            return 0.0
        return len(prompt_vocab & summary_vocab) / len(union)

    # --------------------------------------------------------------------- main
    def run(self,prompts: pd.DataFrame,summaries:pd.DataFrame,mode:str) -> pd.DataFrame:
        """Merge summaries with prompts and compute every feature column.

        Args:
            prompts: Frame with at least `prompt_id` and `prompt_text`.
            summaries: Frame with at least `prompt_id` and `text`.
            mode: Currently unused; it was only needed by the disabled NER features.

        Returns:
            One row per summary with the original columns plus the features.
            Helper columns holding token lists and dicts are dropped.
        """
        # Work on copies so the caller's frames are not modified.
        prompts = prompts.copy()
        summaries = summaries.copy()

        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(list(self.spacy_ner_model.tokenizer(x)))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(self._model_tokens)

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(list(self.spacy_ner_model.tokenizer(x)))
        )
        summaries["summary_tokens"] = summaries["text"].apply(self._model_tokens)
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)

        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']

        #stop words overlap
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)

        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(2,), axis=1
        )
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )

        # NOTE: per-label NER overlap features (ner_overlap_count) are not
        # computed here; they were disabled because of their running time.

        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)

        #Additional

        #real overlap words
        input_df['real_word_overlap_count'] = input_df.progress_apply(self.word_overlap_count_real, axis=1)

        input_df['sentence_length'] = input_df['text'].progress_apply(lambda x: len(x.split('.')))
        input_df['vocabulary_richness'] = input_df['text'].progress_apply(lambda x: len(set(x.split())))
        input_df['avg_word_length'] = input_df['text'].progress_apply(lambda x: np.mean([len(word) for word in x.split()]))
        input_df['comma_count'] = input_df['text'].progress_apply(lambda x: x.count(','))
        input_df['semicolon_count'] = input_df['text'].progress_apply(lambda x: x.count(';'))

        input_df['pos_ratios'] = input_df['text'].progress_apply(self.calculate_pos_ratios)
        input_df['pos_mean'] = input_df['pos_ratios'].progress_apply(lambda x: np.mean(list(x.values())))

        input_df['sentiment_scores'] = input_df['text'].progress_apply(self.calculate_sentiment_scores)

        # Expand the score dicts into one column per key; the positional concat
        # relies on the fresh default index produced by merge().
        sentiment_columns = pd.DataFrame(list(input_df['sentiment_scores']))
        input_df = pd.concat([input_df, sentiment_columns], axis=1)

        input_df['exclamation_count'] = input_df['text'].progress_apply(lambda x: x.count('!'))
        input_df['question_count'] = input_df['text'].progress_apply(lambda x: x.count('?'))
        input_df['quote_count'] = input_df['text'].progress_apply(lambda x: x.count('"'))

        input_df['punctuation_ratios'] = input_df['text'].progress_apply(self.calculate_punctuation_ratios)
        input_df['punctuation_sum'] = input_df['punctuation_ratios'].progress_apply(lambda x: np.sum(list(x.values())))

        input_df['keyword_density'] = input_df.progress_apply(self.calculate_keyword_density, axis=1)

        input_df['sentiment_scores_prompt'] = input_df['prompt_text'].progress_apply(self.calculate_sentiment_scores)

        sentiment_columns_prompt = pd.DataFrame(list(input_df['sentiment_scores_prompt']))
        sentiment_columns_prompt.columns = [col +'_prompt' for col in sentiment_columns_prompt.columns]

        input_df = pd.concat([input_df, sentiment_columns_prompt], axis=1)

        input_df['jaccard_similarity'] = input_df.progress_apply(self.jaccard_similarity, axis=1)

        ########### TEXTSTAT FEATURES #############
        for feature_name in self._TEXTSTAT_FEATURES:
            if feature_name == 'text_standard':
                # request a numeric result so the column is usable as a feature
                metric = lambda x: textstat.text_standard(x, float_output=True)
            else:
                metric = getattr(textstat, feature_name)
            input_df[feature_name] = input_df['text'].progress_apply(metric)

        return input_df.drop(columns=["summary_tokens", "prompt_tokens","pos_ratios","sentiment_scores","punctuation_ratios","sentiment_scores_prompt"])
    
# Shared feature builder; model_name selects the tokenizer directory under /kaggle/input.
preprocessor = Preprocessor(model_name="debertav3base")

### Create Test NLP features

In [7]:
# Training frame read from a CSV (presumably features precomputed with the same
# Preprocessor — TODO confirm against the dataset that produced it).
train = pd.read_csv("/kaggle/input/common-lit-train/commont-lit-train.csv")
gkf = GroupKFold(n_splits=4)

# Tag every row with the index of the split in which it is held out. Grouping
# by prompt_id keeps all summaries of one prompt in the same fold.
for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.loc[val_index, "fold"] = i

# Build the hand-crafted features for the test set.
test = preprocessor.run(prompts_test, summaries_test, mode="test")

100%|██████████| 4/4 [00:00<00:00, 2594.28it/s]
100%|██████████| 4/4 [00:00<00:00, 2097.41it/s]
100%|██████████| 4/4 [00:00<00:00, 3919.91it/s]
100%|██████████| 4/4 [00:00<00:00, 3980.36it/s]
100%|██████████| 4/4 [00:00<00:00, 3494.53it/s]
100%|██████████| 4/4 [00:00<00:00, 905.70it/s]
100%|██████████| 4/4 [00:00<00:00, 6864.65it/s]
100%|██████████| 4/4 [00:00<00:00, 9505.50it/s]
100%|██████████| 4/4 [00:00<00:00, 7781.64it/s]
100%|██████████| 4/4 [00:00<00:00, 10356.31it/s]
100%|██████████| 4/4 [00:00<00:00, 4687.68it/s]
100%|██████████| 4/4 [00:00<00:00, 22.92it/s]
100%|██████████| 4/4 [00:00<00:00, 5862.06it/s]
100%|██████████| 4/4 [00:00<00:00, 66.89it/s]
100%|██████████| 4/4 [00:00<00:00, 7667.83it/s]
100%|██████████| 4/4 [00:00<00:00, 8089.30it/s]
100%|██████████| 4/4 [00:00<00:00, 8460.52it/s]
100%|██████████| 4/4 [00:00<00:00, 6870.28it/s]
100%|██████████| 4/4 [00:00<00:00, 6878.73it/s]
100%|██████████| 4/4 [00:00<00:00, 3953.16it/s]
100%|██████████| 4/4 [00:00<00:00, 77.66it/s

In [8]:
# Display the engineered test features.
test

Unnamed: 0,student_id,prompt_id,text,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,length_ratio,...,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,text_standard,spache_readability,mcalpine_eflaw,reading_time,syllable_count,polysyllabcount,monosyllabcount
0,000000ffffff,abc123,Example text 1,3,0,Summarize...,Example Title 1,Heading\nText...,4,0.75,...,-2.38,1.5,19.58,6.0,4.13,4.0,0.18,5,1,2
1,111111eeeeee,def789,Example text 2,3,0,Summarize...,Example Title 2,Heading\nText...,4,0.75,...,-2.38,1.5,19.58,6.0,4.13,4.0,0.18,5,1,2
2,222222cccccc,abc123,Example text 3,3,0,Summarize...,Example Title 1,Heading\nText...,4,0.75,...,-2.38,1.5,19.58,6.0,4.13,4.0,0.18,5,1,2
3,333333dddddd,def789,Example text 4,3,0,Summarize...,Example Title 2,Heading\nText...,4,0.75,...,-2.38,1.5,19.58,6.0,4.13,4.0,0.18,5,1,2


### Regressor - Predict Only

In [9]:
class ScoreRegressor:
    """Inference wrapper around one fine-tuned sequence-classification checkpoint.

    The checkpoint directory is resolved once in the constructor and used for
    both the tokenizer and the model, so the two always come from the same place.
    """

    def __init__(self, 
                model_dir: str,
                inputs: List[str],
                target_cols: List[str],
                max_length: int,
                ):
        """
        Args:
            model_dir: Directory that contains the checkpoint sub-directory.
            inputs: Text columns joined (with the sep token) into the model input.
            target_cols: Target column names; stored but not used for inference.
            max_length: Token length every input is padded/truncated to.
        """
        self.input_col = "input" # col name of model input after text concat sep token
        self.input_text_cols = inputs
        self.target_cols = target_cols
        self.model_dir = model_dir
        self.max_length = max_length

        self.checkpoint_path = self._resolve_checkpoint(self.model_dir)

        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint_path)
        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )

    @staticmethod
    def _resolve_checkpoint(model_dir: str) -> str:
        """Return the path of the first (name-sorted, non-hidden) entry of `model_dir`.

        os.listdir() returns entries in arbitrary order, so sorting makes the
        choice reproducible when the directory holds more than one entry.

        Raises:
            FileNotFoundError: If `model_dir` has no usable entry.
        """
        entries = sorted(name for name in os.listdir(model_dir) if not name.startswith("."))
        if not entries:
            raise FileNotFoundError(f"No checkpoint found in {model_dir}")
        return os.path.join(model_dir, entries[0])

    def concatenate_with_sep_token(self, row):
        """Join the configured text columns of `row` with ' <sep_token> '."""
        sep = " " + self.tokenizer.sep_token + " "        
        return sep.join(row[self.input_text_cols])

    def _tokenize(self, examples):
        """Tokenize the model-input column, padded and truncated to max_length."""
        return self.tokenizer(examples[self.input_col],
                        padding="max_length",
                        truncation=True,
                        max_length=self.max_length)

    def tokenize_function(self, examples: pd.DataFrame):
        """Tokenize a labelled example; labels are ordered [content, wording]."""
        labels = [examples["content"], examples["wording"]]
        return {
            **self._tokenize(examples),
            "labels": labels,
        }

    def tokenize_function_test(self, examples: pd.DataFrame):
        """Tokenize an unlabelled example."""
        return self._tokenize(examples)

    def predict(self, 
                test_df: pd.DataFrame,
                batch_size: int,
                fold: int,
               ):
        """Predict content and wording scores for every row of `test_df`.

        Args:
            test_df: Frame holding the configured input text columns; it is not modified.
            batch_size: Per-device evaluation batch size.
            fold: Unused; kept so existing callers keep working.

        Returns:
            DataFrame with columns `content_pred` and `wording_pred`, one row
            per input row, in input order.
        """
        # Build the model input in a separate frame instead of adding a column
        # to the caller's frame.
        model_inputs = test_df.apply(self.concatenate_with_sep_token, axis=1)
        input_frame = pd.DataFrame({self.input_col: model_inputs})

        test_dataset = Dataset.from_pandas(input_frame, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_config = AutoConfig.from_pretrained(self.checkpoint_path)
        model = AutoModelForSequenceClassification.from_pretrained(self.checkpoint_path, config=model_config)
        model.eval()
        test_args = TrainingArguments(
            output_dir='/kaggle/working/',
            do_train=False,
            do_predict=True,
            per_device_eval_batch_size=batch_size,
            dataloader_drop_last=False,
            # Half precision needs a GPU; asking for it on a CPU-only runtime fails.
            fp16=torch.cuda.is_available(),
            auto_find_batch_size=True,
        )

        # init trainer
        infer_content = Trainer(
                      model = model, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset).predictions
        # Column order follows the label order used in tokenize_function.
        pred_df = pd.DataFrame(
            preds, 
            columns=[
                "content_pred", 
                "wording_pred"
           ]
        )

        # Release the model before the next checkpoint is loaded.
        model.cpu()
        del model
        gc.collect()
        torch.cuda.empty_cache()

        return pred_df

### Predict function

In [10]:
def predict(
    test: pd.DataFrame,
    mode: str,
    targets:List[str],
    inputs: List[str],
    n_splits: int,
    batch_size: int,
    max_length : int,
    path_to_folds: str,
    model_col_num: int,
    ):
    """Average the test predictions of the `n_splits` fold checkpoints of one model.

    Returns a two-column frame, `content_{mode}_pred_{model_col_num}` and
    `wording_{mode}_pred_{model_col_num}`, indexed like `test`.
    """
    test_df = test.copy()
    score_names = ("content", "wording")

    for fold in range(n_splits):
        print(f"fold {fold}:")

        regressor = ScoreRegressor(
            model_dir=f"{path_to_folds}/fold_{fold}/{fold}",
            target_cols=targets,
            inputs=inputs,
            max_length=max_length,
        )
        fold_preds = regressor.predict(
            test_df=test_df,
            batch_size=batch_size,
            fold=fold,
        )

        # keep every fold's raw prediction in its own column
        for score in score_names:
            test_df[f"{score}_{mode}_pred_{fold}"] = fold_preds[f"{score}_pred"].values

    # Mean over the fold columns. When model_col_num < n_splits the result
    # column shares its name with one of the fold columns and overwrites it;
    # the mean is computed before the assignment, so the value is unaffected.
    result_columns = []
    for score in score_names:
        per_fold = [f"{score}_{mode}_pred_{fold}" for fold in range(n_splits)]
        averaged = f"{score}_{mode}_pred_{model_col_num}"
        test_df[averaged] = test_df[per_fold].mean(axis=1)
        result_columns.append(averaged)

    return test_df[result_columns]



def validate(
    train: pd.DataFrame,
    mode: str,
    targets: List[str],
    inputs: List[str],
    n_splits: int,
    batch_size: int,
    path_to_folds: str,
    model_col_num: int,
    max_length : int,
    ):
    """Produce out-of-fold predictions of one model for the training frame.

    Rows whose `fold` column equals k are scored with the checkpoint stored
    under `{path_to_folds}/fold_{k}/{k}`.

    Returns:
        A two-column frame, `content_{mode}_pred_{model_col_num}` and
        `wording_{mode}_pred_{model_col_num}`, indexed like `train`.
    """
    train_df = train.copy()

    for fold in range(n_splits):
        print(f"fold {fold}:")

        # Take an explicit copy: a boolean-mask selection that is later written
        # to is chained assignment, whose effect pandas does not guarantee.
        valid_data = train_df[train_df["fold"] == fold].copy()

        model_dir =  f"{path_to_folds}/fold_{fold}/{fold}"

        csr = ScoreRegressor(
            model_dir = model_dir,
            target_cols=targets,
            inputs= inputs,
            max_length=max_length,
           )

        pred_df = csr.predict(
            test_df=valid_data,
            batch_size=batch_size,
            fold=fold
        )

        # Predictions come back in row order, so write them by the fold's index labels.
        train_df.loc[valid_data.index, f"content_{mode}_pred_{model_col_num}"] = pred_df["content_pred"].values
        train_df.loc[valid_data.index, f"wording_{mode}_pred_{model_col_num}"] = pred_df["wording_pred"].values

    return train_df[[f"content_{mode}_pred_{model_col_num}", f"wording_{mode}_pred_{model_col_num}"]]

### Models Paths

In [11]:
# Root directory of each fine-tuned model. validate()/predict() look for the
# per-fold checkpoints under "<root>/fold_<k>/<k>". Commented-out entries are
# models that are not part of the ensemble.
# ALBERT_V2_PATH = "/kaggle/input/albert-v3-base-exp1/content/albert-base-v2-model/exp_1"
ALL_MPNET_BASE_V2_PATH = "/kaggle/input/all-mpnet-base-v2-exp1/exp_1/content/all-mpnet-base-v2-model/exp_1"
DEBERTA_V3_BASE_PATH = "/kaggle/input/deberta-v3-base-exp1/exp_1/content/deberta-v3-base-model/exp_1"
# ELECTRA_BASE_DISCRIMINATOR_PATH = "/kaggle/input/electra-base-discriminator-exp1/content/electra-base-discriminator/exp_1"
FUNNEL_MEDIUM_BASE_PATH = "/kaggle/input/funnel-medium-base-exp1/exp_1/content/medium-base-model/exp_1"
# ROBERTA_BASE_SQUAD2_PATH = "/kaggle/input/roberta-base-squad2-exp1/content/roberta-base-squad2/exp_1"
XLM_ROBERTA_BASE_PATH = "/kaggle/input/xlm-roberta-base-exp1/exp_1/content/xlm-roberta-base-model/exp_1"
DEBERTA_V3_LARGE_PATH = "/kaggle/input/deberta-v3-large-exp1/exp_1/content/deberta-v3-large-model/exp_1"
XLM_ROBERTA_LARGE_PATH = "/kaggle/input/xlm-roberta-large-exp1/exp_1/content/xlm-roberta-large-model/exp_1"


# Models in the ensemble; the position in this list becomes the suffix of each
# model's prediction columns.
model_paths = [ALL_MPNET_BASE_V2_PATH,FUNNEL_MEDIUM_BASE_PATH,XLM_ROBERTA_BASE_PATH,DEBERTA_V3_LARGE_PATH,XLM_ROBERTA_LARGE_PATH,DEBERTA_V3_BASE_PATH]

### Predict on Train

In [12]:
targets = ["wording", "content"]
input_cols = ["prompt_title", "prompt_question", "text"]

# Collect one out-of-fold prediction frame per model and concatenate them once,
# instead of growing a DataFrame with pd.concat inside the loop.
train_pred_frames = []
for i,model_path in enumerate(model_paths): 
    print(model_path)
    # large checkpoints get a smaller evaluation batch
    batch_size = 10 if 'large' in model_path else 12
    train_pred_frames.append(
        validate(
            train,
            mode="multi",
            targets=targets,
            inputs=input_cols,
            batch_size=batch_size,
            n_splits=4,
            max_length=512,
            path_to_folds=model_path,
            model_col_num=i,
        )
    )

# Every frame shares train's index, so a column-wise concat lines rows up.
ensemble_df_train = pd.concat(train_pred_frames, axis=1) if train_pred_frames else pd.DataFrame({})

/kaggle/input/all-mpnet-base-v2-exp1/exp_1/content/all-mpnet-base-v2-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


/kaggle/input/funnel-medium-base-exp1/exp_1/content/medium-base-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


/kaggle/input/xlm-roberta-base-exp1/exp_1/content/xlm-roberta-base-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


/kaggle/input/deberta-v3-large-exp1/exp_1/content/deberta-v3-large-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


/kaggle/input/xlm-roberta-large-exp1/exp_1/content/xlm-roberta-large-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


/kaggle/input/deberta-v3-base-exp1/exp_1/content/deberta-v3-base-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


### Predict on Test

In [13]:
targets = ["wording", "content"]
input_cols = ["prompt_title", "prompt_question", "text"]

# Collect one averaged-over-folds prediction frame per model and concatenate
# them once, instead of growing a DataFrame with pd.concat inside the loop.
test_pred_frames = []
for i,model_path in enumerate(model_paths): 
    print(model_path)
    # large checkpoints get a smaller evaluation batch
    batch_size = 10 if 'large' in model_path else 12
    test_pred_frames.append(
        predict(
            test,
            mode="multi",
            targets=targets,
            inputs=input_cols,
            # pass the per-model value; a literal 12 here ignored the
            # reduced batch size chosen for the large models
            batch_size=batch_size,
            n_splits=4,
            max_length=512,
            path_to_folds=model_path,
            model_col_num=i,
        )
    )

# Every frame shares test's index, so a column-wise concat lines rows up.
ensemble_df = pd.concat(test_pred_frames, axis=1) if test_pred_frames else pd.DataFrame({})

/kaggle/input/all-mpnet-base-v2-exp1/exp_1/content/all-mpnet-base-v2-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


/kaggle/input/funnel-medium-base-exp1/exp_1/content/medium-base-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


/kaggle/input/xlm-roberta-base-exp1/exp_1/content/xlm-roberta-base-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


/kaggle/input/deberta-v3-large-exp1/exp_1/content/deberta-v3-large-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


/kaggle/input/xlm-roberta-large-exp1/exp_1/content/xlm-roberta-large-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


/kaggle/input/deberta-v3-base-exp1/exp_1/content/deberta-v3-base-model/exp_1
fold 0:


fold 1:


fold 2:


fold 3:


### Regular Ensemble

In [14]:
# Unweighted ensemble: for each target, average the prediction columns of all
# models (columns are selected by the target name appearing in their label).
final_ensemble_train_df = pd.DataFrame({
    'pred_content': ensemble_df_train.filter(regex='content').apply(np.mean, axis=1),
    'pred_wording': ensemble_df_train.filter(regex='wording').apply(np.mean, axis=1),
})

final_ensemble_df = pd.DataFrame({
    'pred_content': ensemble_df.filter(regex='content').apply(np.mean, axis=1),
    'pred_wording': ensemble_df.filter(regex='wording').apply(np.mean, axis=1),
})

# Append the ensembled transformer predictions as extra feature columns.
train = pd.concat([train, final_ensemble_train_df], axis=1)
test = pd.concat([test, final_ensemble_df], axis=1)

### LGBM

In [15]:
# Targets predicted by the LightGBM stage, one set of fold models per target.
targets = ["content", "wording"]

# Columns removed from `train` before fitting: identifiers, raw text, the fold
# marker, a set of engineered features left out of the model, and the targets.
drop_columns = ["fold", "student_id", "prompt_id", "text",
                "prompt_question", "prompt_title",
                "prompt_text","prompt_length",
                "avg_word_length","semicolon_count","neg","neu","pos","compound",
                "exclamation_count","question_count","punctuation_sum","neg_prompt","neu_prompt","pos_prompt",
                "compound_prompt","flesch_reading_ease","flesch_kincaid_grade","gunning_fog","automated_readability_index",
                "coleman_liau_index","linsear_write_formula","dale_chall_readability_score","text_standard","spache_readability",
                "mcalpine_eflaw"
               ] + targets

In [16]:
model_dict = {}

# Hyper-parameters are the same for every target and fold, so define them once.
params = {
          'boosting_type': 'gbdt',
          'random_state': 42,
          'objective': 'regression',
          'metric': 'rmse',
          'learning_rate': 0.048,
          'lambda_l1': 0.0,
          'lambda_l2': 0.011
          }

for target in targets:
    models = []

    for fold in range(4):
        # fold `fold` is held out for early stopping; the rest is fitted on
        is_holdout = train["fold"] == fold
        train_part = train[~is_holdout]
        eval_part = train[is_holdout]

        X_train_cv = train_part.drop(columns=drop_columns)
        y_train_cv = train_part[target]

        X_eval_cv = eval_part.drop(columns=drop_columns)
        y_eval_cv = eval_part[target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        evaluation_results = {}
        model = lgb.train(params,
                          num_boost_round=10000,
                          train_set=dtrain,
                          # Only the held-out fold is evaluated, so give exactly
                          # one name; it used to be labelled 'train', which made
                          # the validation RMSE read as training RMSE in the logs.
                          valid_sets=[dval],
                          valid_names=['valid'],
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                              lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        models.append(model)

    model_dict[target] = models

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3004
[LightGBM] [Info] Number of data points in the train set: 5108, number of used features: 21
[LightGBM] [Info] Start training from score 0.017606
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[51]	train's rmse: 0.395312
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2926
[LightGBM] [Info] Number of data points in the train set: 5156, number of used features: 21
[LightGBM] [Info] Start training from score -0.039959
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.47581
Early stopping, best iteration is:
[142]	train's rmse: 0.475182
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2927
[LightGBM] [Info] Number of data points in the train set: 5169, number of used features: 21


In [17]:
rmses = []

for target in targets:
    trues, preds = [], []

    # The k-th model of a target was early-stopped on fold k, so it is scored
    # on the rows of that same fold.
    for fold, model in enumerate(model_dict[target]):
        holdout = train[train["fold"] == fold]

        preds.extend(model.predict(holdout.drop(columns=drop_columns)))
        trues.extend(holdout[target])

    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses.append(rmse)

# mean of the per-target RMSE values
print(f"mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 0.4328681846407399
wording_rmse : 0.5664064483417809
mcrmse : 0.4996373164912604


### Predict on Test and Submission

In [18]:
# Columns removed from `test` before prediction. Unlike the training list, this
# one has no "fold" entry and no target names. NOTE: this rebinds the name
# `drop_columns` that the training and CV-scoring cells read.
drop_columns = ["student_id", "prompt_id", "text",
                "prompt_question", "prompt_title",
                "prompt_text","prompt_length",
                "avg_word_length","semicolon_count","neg","neu","pos","compound",
                "exclamation_count","question_count","punctuation_sum","neg_prompt","neu_prompt","pos_prompt",
                "compound_prompt","flesch_reading_ease","flesch_kincaid_grade","gunning_fog","automated_readability_index",
                "coleman_liau_index","linsear_write_formula","dale_chall_readability_score","text_standard","spache_readability",
                "mcalpine_eflaw"
               ]

In [19]:
pred_dict = {}

# The feature frame is identical for every model, so build it once.
test_features = test.drop(columns=drop_columns)

for target in targets:
    preds = []

    for model in model_dict[target]:
        # `train` was read from a CSV while `test` was built by Preprocessor.run,
        # so their column order is not guaranteed to agree. Select the columns
        # in the order the model was trained on; a missing feature then fails
        # loudly instead of being scored against the wrong column.
        preds.append(model.predict(test_features[model.feature_name()]))

    pred_dict[target] = preds

In [20]:
# Store each fold model's prediction in its own column, then average the four
# fold columns into the final value of the target.
for target in targets:
    for fold_idx, fold_pred in enumerate(pred_dict[target]):
        test[f"{target}_pred_{fold_idx}"] = fold_pred

    fold_columns = [f"{target}_pred_{fold}" for fold in range(4)]
    test[target] = test[fold_columns].mean(axis=1)

In [21]:
# Write the submission file: one row per student_id with both predicted scores.
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)