<a href="https://colab.research.google.com/github/snassimr/CommonLit/blob/master/CommonLit_V10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%reload_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)

Mounted at /content/gdrive


In [3]:
SYS_PROJECT_DIR = '/content/gdrive/MyDrive/Colab Notebooks/CommonLit'
SYS_VERSION_DIR = '/content/gdrive/MyDrive/Colab Notebooks/CommonLit/V10'
SYS_OUTPUT_DIR  = '/content/gdrive/MyDrive/Colab Notebooks/CommonLit/V10'
SYS_LLM_DIR     = '/content/gdrive/MyDrive/Colab Notebooks/CommonLit/llm'

In [4]:
import os
import pandas as pd
import shutil

In [5]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [6]:
# shutil.rmtree('test123')

# Explore environment

In [7]:
# import psutil
# import platform
# uname = platform.uname()
# print(f"System: {uname.system}")  #Windows or Linux
# print(f"Node Name: {uname.node}") # System name
# print(f"Release: {uname.release}") # OS release version like  10(Windows) or 5.4.0-72-generic(linux)
# print(f"Version: {uname.version}")
# print(f"Machine: {uname.machine}")  # machine can be AMD64 or x86-64
# print(f"Processor: {uname.processor}") #  Intel64 Family 6 or x86_64
# print("Physical cores:", psutil.cpu_count(logical=False))
# print("Total cores:", psutil.cpu_count(logical=True))

# def get_size(bytes, suffix="B"):
#     """
#     Scale bytes to its proper format- KB, MB, GB, TB and PB
#     """
#     factor = 1024
#     for unit in ["", "K", "M", "G", "T", "P"]:
#         if bytes < factor:
#             return f"{bytes:.2f}{unit}{suffix}"
#         bytes /= factor

# print("Virtual memory")
# svmem = psutil.virtual_memory()
# print(f"Total: {get_size(svmem.total)}")
# print(f"Available: {get_size(svmem.available)}")
# print(f"Used: {get_size(svmem.used)}")

# Read Data

## Train Data

In [8]:
prompts_train   = pd.read_csv(os.path.join(SYS_PROJECT_DIR,'prompts_train.csv'))
summaries_train = pd.read_csv(os.path.join(SYS_PROJECT_DIR,'summaries_train.csv'))

In [9]:
print(f"Prompts train shape: {prompts_train.shape}")
display(prompts_train.head())
print(f"Summary train shape: {summaries_train.shape}")
display(summaries_train.head())

Prompts train shape: (4, 4)


Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


Summary train shape: (7165, 5)


Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


## Test Data

In [10]:
prompts_test   = pd.read_csv(os.path.join(SYS_PROJECT_DIR,'prompts_test.csv'))
summaries_test = pd.read_csv(os.path.join(SYS_PROJECT_DIR,'summaries_test.csv'))

In [11]:
print(f"Prompts test shape: {prompts_test.shape}")
display(prompts_test.head())
print(f"Summary test shape: {summaries_test.shape}")
display(summaries_test.head())

Prompts test shape: (2, 4)


Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,abc123,Summarize...,Example Title 1,Heading\nText...
1,def789,Summarize...,Example Title 2,Heading\nText...


Summary test shape: (4, 3)


Unnamed: 0,student_id,prompt_id,text
0,000000ffffff,abc123,Example text 1
1,111111eeeeee,def789,Example text 2
2,222222cccccc,abc123,Example text 3
3,333333dddddd,def789,Example text 4


# Set Seed

In [12]:
# set random seed
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(seed=42)

# Model Config

## Initialize Model_Config

In [13]:
Model_Config = {
    'model_name' : 'debertav3base',
    'base_model' : {
      'root_dir'      : 'base_model',
      'model_name'    : 'debertav3base',
      'num_labels'    : 1,
      'problem_type'  : 'regression',
      'hidden_dropout_prob' : 0.005,
      'attention_probs_dropout_prob' : 0.005,
      'max_length' : 512,
      'save_steps' : 100
    },
    'llm' : {
        'model_dir' : 'meta-llama_Llama-2-7b-chat-hf',
        'max_new_tokens' : 200
    },
    'model_root_dir': 'model',
    'random_seed' : 42,
    'n_splits'    : 4,
    'num_train_epochs' : 5,
    'learning_rate' : 1.5e-5,
    'batch_size' : 16,
    'weight_decay'  : 0.02
}

# Load Models

In [14]:
!pip install accelerate==0.20.3
!pip install transformers==4.30.2
!pip install datasets
!pip install sentencepiece

Collecting accelerate==0.20.3
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3
Collecting transformers==4.30.2
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.30.2)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.2)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 

## Load llm model

In [15]:
import os
bitsandbytes_path = os.path.join(SYS_PROJECT_DIR, "bitsandbytes-0.41.1-py3-none-any.whl")
!pip install "{bitsandbytes_path}"

Processing ./gdrive/MyDrive/Colab Notebooks/CommonLit/bitsandbytes-0.41.1-py3-none-any.whl
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.41.1


In [16]:
import torch, accelerate
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from datasets import Dataset

import os
llm_model_dir = Model_Config['llm']['model_dir']
llm_model_local_dir = os.path.join(SYS_LLM_DIR, llm_model_dir)


compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)

n_gpus = torch.cuda.device_count()
max_memory = f'{16384}MB'
llm_model = AutoModelForCausalLM.from_pretrained(
          llm_model_local_dir, quantization_config=bnb_config,
          device_map={"": 0},
          # device_map = "auto",  max_memory = {i: max_memory for i in range(n_gpus)}
          )
llm_model.config.use_cache = False
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_local_dir , use_fast=True)
llm_tokenizer.pad_token = llm_tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /content/gdrive/MyDrive/Colab Notebooks/CommonLit/llm/meta-llama_Llama-2-7b-chat-hf and are newly initialized: ['model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn

## Load Deberta model

## Download and save base model

In [17]:
# import transformers
# from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

# base_model = AutoModel.from_pretrained(f"microsoft/deberta-v3-base")
# base_model_tokenizer = AutoTokenizer.from_pretrained(f"microsoft/deberta-v3-base")
# base_model_config = AutoConfig.from_pretrained(f"microsoft/deberta-v3-base")
# base_model_content = AutoModelForSequenceClassification.from_pretrained(f"microsoft/deberta-v3-base", config = base_model_config)

# import os
# base_model_root_dir = Model_Config['base_model']['root_dir']
# base_model_save_directory = os.path.join(SYS_PROJECT_DIR, base_model_root_dir)

# # Save model
# base_model.save_pretrained(base_model_save_directory)
# # Save tokenizer
# base_model_tokenizer.save_pretrained(base_model_save_directory)
# # Save config
# base_model_config.save_pretrained(base_model_save_directory)

## Load base model

In [18]:
import os
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

base_model_root_dir = Model_Config['base_model']['root_dir']
base_model_save_directory = os.path.join(SYS_PROJECT_DIR, base_model_root_dir)

base_model = AutoModel.from_pretrained(base_model_save_directory)
base_model_tokenizer = AutoTokenizer.from_pretrained(base_model_save_directory)
base_model_config  = AutoConfig.from_pretrained(base_model_save_directory)


## Update base model

In [19]:
base_model_config.update(
    Model_Config['base_model']
)

## Update Model_Config

In [20]:
base_model_content = AutoModelForSequenceClassification.from_pretrained(base_model_save_directory, config = base_model_config)
Model_Config['base_model']['base_model'] = base_model
Model_Config['base_model']['tokenizer']  = base_model_tokenizer
Model_Config['base_model']['config']     = base_model_config
Model_Config['base_model']['model_content'] = base_model_content

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /content/gdrive/MyDrive/Colab Notebooks/CommonLit/base_model and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
################################################################

# Text Feature Engineering

## NLP

In [22]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
import spacy
import re
from tqdm import tqdm
tqdm.pandas()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [23]:
import os
pyspellchecker_path = os.path.join(SYS_PROJECT_DIR, "pyspellchecker-0.7.2-py3-none-any.whl")
!pip install "{pyspellchecker_path}"

Processing ./gdrive/MyDrive/Colab Notebooks/CommonLit/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [24]:
import os
autocorrect_path = os.path.join(SYS_PROJECT_DIR, "autocorrect-2.6.1.tar")
!pip install "{autocorrect_path}"

Processing ./gdrive/MyDrive/Colab Notebooks/CommonLit/autocorrect-2.6.1.tar
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622363 sha256=48482cdbadae5c261baaa108fef63f0f8693892e0771eacadedcb7021aba9753
  Stored in directory: /root/.cache/pip/wheels/ab/ef/d4/824805febc9b7df81c228822efe02c2bc31f1d8b9f2f0c83b8
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1


In [25]:
!pip install swifter

Collecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.4.0-py3-none-any.whl size=16507 sha256=2c100443fb08ae3a64135d93516c0a9c9a7a93f5a1d21641a6baae2bf622194a
  Stored in directory: /root/.cache/pip/wheels/e4/cf/51/0904952972ee2c7aa3709437065278dc534ec1b8d2ad41b443
Successfully built swifter
Installing collected packages: swifter
Successfully installed swifter-1.4.0


In [26]:
class Preprocessor:
    def __init__(self,
                tokenizer: AutoTokenizer,
                ) -> None:

        from spellchecker import SpellChecker
        from autocorrect import Speller
        import swifter

        self.tokenizer  = tokenizer
        self.STOP_WORDS = set(stopwords.words('english'))

        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spell_checker = SpellChecker() #Speller(lang='en')

    def text_length(self, df: pd.DataFrame, col:str) -> pd.Series:
        """ text length """
        tokenizer=self.tokenizer
        return df[col].progress_apply(lambda x: len(tokenizer.encode(x)))

    def word_overlap_length(self, row):
        """ word overlap length between prompt_text and text """
        def check_is_stop_word(word):
            return word in self.STOP_WORDS

        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))

    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int):
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)

        # # Optionally, you can get the frequency of common n-grams for a more nuanced analysis
        # original_ngram_freq = Counter(ngrams(original_words, n))
        # summary_ngram_freq = Counter(ngrams(summary_words, n))
        # common_ngram_freq = {ngram: min(original_ngram_freq[ngram], summary_ngram_freq[ngram]) for ngram in common_ngrams}

        return len(common_ngrams)

    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def misspelling_count(self, text):

        wordlist=text.split()
        misspelling_count = len(list(self.spell_checker.unknown(wordlist)))

        return misspelling_count

    def run(self,
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:

        # Before merge preprocess
        # prompts
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(self.tokenizer.encode(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x),
                skip_special_tokens=True
            )
        )

        # summaries
        summaries["corrected_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        )
        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(self.tokenizer.encode(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: self.tokenizer.convert_ids_to_tokens(
                self.tokenizer.encode(x),
                skip_special_tokens=True
            )

        )
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.misspelling_count)

        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']

        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_length, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1
        )
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )

        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)

        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])

In [27]:
preprocessor = Preprocessor(tokenizer = base_model_tokenizer)

train = preprocessor.run(prompts_train, summaries_train, mode = "train")
test = preprocessor.run(prompts_test, summaries_test, mode = "test")

100%|██████████| 7165/7165 [09:18<00:00, 12.83it/s]
100%|██████████| 7165/7165 [00:00<00:00, 8195.35it/s]
100%|██████████| 7165/7165 [00:00<00:00, 10958.58it/s]
100%|██████████| 7165/7165 [00:01<00:00, 4077.63it/s]
100%|██████████| 7165/7165 [00:02<00:00, 3404.34it/s]
100%|██████████| 7165/7165 [00:00<00:00, 79859.77it/s]
100%|██████████| 4/4 [00:00<00:00, 7342.33it/s]
100%|██████████| 4/4 [00:00<00:00, 10611.77it/s]
100%|██████████| 4/4 [00:00<00:00, 2438.55it/s]
100%|██████████| 4/4 [00:00<00:00, 3855.06it/s]
100%|██████████| 4/4 [00:00<00:00, 3714.24it/s]
100%|██████████| 4/4 [00:00<00:00, 4188.02it/s]


## LLM based features

In [28]:
def create_prompt(df : pd.DataFrame):

    from tqdm import trange

    prompt_instruction = f"""
    ### Instruction:
    Answer Question with information from Background.
    Answer should be concise as possible.
    """

    prompts = list()
    for i in trange(len(df.index)):

        prompt = prompt_instruction + \
        f"""
        ### Background ###
        [{df.iloc[i].prompt_text}]
        ### Question ###
        [{df.iloc[i].prompt_question}]

        ### Answer ###
        """

        prompts.append(prompt)

    return prompts

def generate_prompt_answers(df , model, tokenizer, llm_config):

    import pandas as pd
    from transformers.generation import GenerationConfig

    max_new_tokens = llm_config['max_new_tokens']
    gen_answers = []

    for index, row in df.iterrows():

        prompt = str(row['prompt_gen_answer'])
        input = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to("cuda:0")
        gen_cfg = GenerationConfig.from_model_config(model.config)
        gen_cfg.max_new_tokens = max_new_tokens
        outputs = model.generate(**input, generation_config = gen_cfg)
        gen_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        keyword = "### Answer ###\n"
        k_index = gen_answer.find(keyword)
        if index != -1:
          gen_answer = gen_answer[k_index + len(keyword):].strip()
        else:
          gen_answer = ""
        gen_answers.append(gen_answer)

    return gen_answers

In [29]:
prompts_train_llm = prompts_train[['prompt_id']]
prompts_train_llm['prompt_gen_answer'] = create_prompt(prompts_train)
prompts_train_llm['llm_gen_answer']    = generate_prompt_answers(prompts_train_llm, llm_model, llm_tokenizer, Model_Config['llm'])
train = train.merge(prompts_train_llm[['prompt_id', 'llm_gen_answer']], how="left", on="prompt_id")

100%|██████████| 4/4 [00:00<00:00, 3303.90it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prompts_train_llm['prompt_gen_answer'] = create_prompt(prompts_train)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prompts_train_llm['llm_gen_answer']    = generate_prompt_answers(prompts_train_llm, llm_model, llm_tokenizer, Model_Config['llm'])


In [30]:
prompts_test_llm = prompts_test[['prompt_id']]
prompts_test_llm['prompt_gen_answer'] = create_prompt(prompts_test)
prompts_test_llm['llm_gen_answer']    = generate_prompt_answers(prompts_test_llm, llm_model, llm_tokenizer, Model_Config['llm'])
test = test.merge(prompts_test_llm[['prompt_id', 'llm_gen_answer']], how="left", on="prompt_id")

100%|██████████| 2/2 [00:00<00:00, 2316.66it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prompts_test_llm['prompt_gen_answer'] = create_prompt(prompts_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prompts_test_llm['llm_gen_answer']    = generate_prompt_answers(prompts_test_llm, llm_model, llm_tokenizer, Model_Config['llm'])


In [31]:
import gc
llm_model.cpu()
del model
gc.collect()
torch.cuda.empty_cache()

380

In [32]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [33]:
!pip install -q -U sentence-transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [34]:
from sentence_transformers import SentenceTransformer, util

model_st_name = 'sentence-transformers/all-mpnet-base-v2'
model_st_local_dir = os.path.join(SYS_PROJECT_DIR, 'all-mpnet-base-v2')

model_st = SentenceTransformer(model_st_local_dir)

In [35]:
def calculate_similarity(df , text1, text2, model):

    from sentence_transformers import SentenceTransformer, util
    from scipy.stats import pearsonr

    cos_sim_list = []

    gen_answers = list(df[text1])
    texts       = list(df[text2])

    embeddings_1 = model.encode(gen_answers, convert_to_tensor=True)
    embeddings_2 = model.encode(texts, convert_to_tensor=True)

    for embedding_1, embedding_2 in zip(embeddings_1, embeddings_2):
        cos_sim = util.pytorch_cos_sim(embedding_1, embedding_2).item()
        cos_sim_list.append(cos_sim)

    return cos_sim_list

In [36]:
train['cos_sim_1']  = calculate_similarity(train, 'llm_gen_answer', 'text', model_st)

In [37]:
train['cos_sim_2']  = calculate_similarity(train, 'llm_gen_answer', 'corrected_text', model_st)

In [38]:
test['cos_sim_1']  = calculate_similarity(test, 'llm_gen_answer', 'text', model_st)

In [39]:
test['cos_sim_2']  = calculate_similarity(test, 'llm_gen_answer', 'corrected_text', model_st)

## Prompts similarity

In [40]:
# prompts_train_sim = prompts_train[['prompt_id']]
# prompts_train_sim['prompt_sim_1'] = calculate_similarity(prompts_train, "prompt_title", "prompt_question", model_st)
# prompts_train_sim['prompt_sim_2'] = calculate_similarity(prompts_train, "prompt_question", "prompt_text", model_st)
# prompts_train_sim['prompt_sim_3'] = calculate_similarity(prompts_train, "prompt_title", "prompt_text", model_st)
# train = train.merge(prompts_train_sim, how="left", on="prompt_id")

In [41]:
# prompts_test_sim = prompts_test[['prompt_id']]
# prompts_test_sim['prompt_sim_1'] = calculate_similarity(prompts_test, "prompt_title", "prompt_question", model_st)
# prompts_test_sim['prompt_sim_2'] = calculate_similarity(prompts_test, "prompt_question", "prompt_text", model_st)
# prompts_test_sim['prompt_sim_3'] = calculate_similarity(prompts_test, "prompt_title", "prompt_text", model_st)
# test = test.merge(prompts_test_sim, how="left", on="prompt_id")

## Save data

In [42]:
import dill

dill_data = dict()
dill_data['train'] = train ; dill_data['test'] = test
dill_data_filename = os.path.join(SYS_PROJECT_DIR, 'dill_data')
with open(dill_data_filename, "wb") as dill_file:
    dill.dump(dill_data, dill_file)

## Load data

In [43]:
import dill

dill_data_filename = os.path.join(SYS_PROJECT_DIR, 'dill_data')
with open(dill_data_filename, "rb") as dill_file:
  dill_data = dill.load(dill_file)
train = dill_data['train'] ; test = dill_data['test']

# Modeling

## Create folds

In [44]:
from sklearn.model_selection import KFold, GroupKFold

gkf = GroupKFold(n_splits = Model_Config['n_splits'])

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.loc[val_index, "fold"] = i

## Define evaluation metrics

In [45]:
def compute_metrics(eval_pred):

    """
    Calculates rmse as a evaluation metric to extend Trainer logging with additional metrics along with loss
    """

    from sklearn.metrics import mean_squared_error

    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """

    import numpy as np

    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):

    from sklearn.metrics import mean_squared_error

    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)

    return (content_score + wording_score)/2

## Model (regression)

## class Model

In [46]:
class Model:
    def __init__(self,
                model_config : dict,
                model_dir: str,
                target: str
                ):
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "text"]
        self.input_col = "input"

        self.text_cols = [self.input_col]
        self.target = target
        self.target_cols = [target]

        self.model_config = model_config
        self.model_name = model_config['model_name']
        self.model_dir = model_dir

        self.base_tokenizer    = model_config['base_model']['tokenizer']
        self.base_model_config = model_config['base_model']['config']

        self.max_length = model_config['base_model']['max_length']

        seed_everything(seed=42)

        from transformers import DataCollatorWithPadding

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.base_tokenizer
        )

    def train_tokenize(self, examples: pd.DataFrame):
      labels = [examples[self.target]]
      tokenized = self.base_tokenizer(examples[self.input_col],
                      padding=False,
                      truncation=True,
                      max_length=self.max_length)
      return {
            **tokenized,
            "labels": labels,
      }

    def test_tokenize(self, examples: pd.DataFrame):
        tokenized = self.base_tokenizer(examples[self.input_col],
                        padding=False,
                        truncation=True,
                        max_length=self.max_length)
        return tokenized


    def train(self,
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            model_config : dict
        ) -> None:
        """fine-tuning"""

        import pandas as pd
        from datasets import Dataset, load_dataset, load_from_disk
        from transformers import TrainingArguments, Trainer

        num_train_epochs =  model_config['num_train_epochs']
        learning_rate    =  model_config['learning_rate']
        batch_size       =  model_config['batch_size']
        weight_decay     =  model_config['weight_decay']
        save_steps       =  model_config['base_model']['save_steps']

        sep = self.base_tokenizer.sep_token

        self.text_col = "corrected_text" if target=="content" else "text"


        train_df[self.input_col] = (
                    train_df["prompt_title"] + sep
                    + train_df["prompt_question"] + sep
                    + train_df[self.text_col]
                  )

        valid_df[self.input_col] = (
                    valid_df["prompt_title"] + sep
                    + valid_df["prompt_question"] + sep
                    + valid_df[self.text_col]
                  )

        # train_df[self.input_col] = (
        #             train_df["prompt_title"] + sep
        #             + train_df["prompt_question"] + sep
        #             + train_df["llm_gen_answer"] + sep
        #             + train_df[self.text_col]
        #           )

        # valid_df[self.input_col] = (
        #             valid_df["prompt_title"] + sep
        #             + valid_df["prompt_question"] + sep
        #             + valid_df["llm_gen_answer"] + sep
        #             + valid_df[self.text_col]
        #           )

        # train_df[self.input_col] = (
        #             train_df["llm_gen_answer"] + sep
        #             + train_df[self.text_col]
        #           )

        # valid_df[self.input_col] = (
        #             valid_df["llm_gen_answer"] + sep
        #             + valid_df[self.text_col]
        #           )

        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False)

        train_tokenized_datasets = train_dataset.map(self.train_tokenize, batched=False)
        val_tokenized_datasets = val_dataset.map(self.train_tokenize, batched=False)

        model_content = model_config['base_model']['model_content']

        # eg. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold))

        # https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments

        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # Whether or not to load the best model found during training at the end of training
            optim  = 'adamw_torch',
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size, # The batch size per GPU/TPU core/CPU for training
            per_device_eval_batch_size=8,           # The batch size per GPU/TPU core/CPU for evaluation
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps", # "steps": Evaluation is done (and logged) every eval_steps.
            eval_steps=save_steps,
            save_steps=save_steps, # Number of updates steps before two checkpoint saves if save_strategy="steps"
            metric_for_best_model="rmse", # Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models. Default is loss
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.base_tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator,

        )

        trainer.train()

        model_content.save_pretrained(self.model_dir)
        self.base_tokenizer.save_pretrained(self.model_dir)

        import gc
        model_content.cpu()
        del model_content
        gc.collect()
        torch.cuda.empty_cache()

    def predict(self,
                test_df: pd.DataFrame,
                fold: int,
                ):
        """predict content score"""

        from datasets import Dataset, load_dataset, load_from_disk
        from transformers import TrainingArguments, Trainer

        sep = self.base_tokenizer.sep_token

        self.text_col = "corrected_text" if target=="content" else "text"

        in_text = (
                    test_df["prompt_title"] + sep
                    + test_df["prompt_question"] + sep
                    + test_df[self.text_col]
                  )

        # in_text = (
        #             test_df["prompt_title"] + sep
        #             + test_df["prompt_question"] + sep
        #             + test_df["llm_gen_answer"] + sep
        #             + test_df[self.text_col]
        #           )

        # in_text = (
        #             test_df["llm_gen_answer"] + sep
        #             + test_df[self.text_col]
        #           )

        test_df[self.input_col] = in_text

        test_ = test_df[[self.input_col]]

        test_dataset = Dataset.from_pandas(test_, preserve_index=False)
        test_tokenized_dataset = test_dataset.map(self.test_tokenize, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()

        # e.g. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold))

        test_args = TrainingArguments(
            output_dir=self.model_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content,
                      tokenizer=self.base_tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

## ############################

## train_by_fold

In [47]:
def train_by_fold(
        train_df: pd.DataFrame,
        target:str,
        model_config: dict
    ):

    model_root_dir = model_config['model_root_dir']
    model_name = model_config['model_name']
    n_splits   = model_config['n_splits']

    for fold in range(Model_Config['n_splits']):
        print(f"fold {fold}:")

        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]

        model_dir =  os.path.join(SYS_VERSION_DIR, model_root_dir, f"{target}/{model_name}/fold_{fold}")

        if os.path.exists(model_dir):
          shutil.rmtree(model_dir)

        model = Model(
            model_config=model_config,
            model_dir = model_dir,
            target=target
           )

        model.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data,
            model_config = model_config
        )

## Run train_by_fold

In [48]:
for target in ["content"]:
    train_by_fold(
        train,
        target=target,
        model_config = Model_Config
    )

fold 0:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (


Map:   0%|          | 0/5108 [00:00<?, ? examples/s]

Map:   0%|          | 0/2057 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [None]:
for target in ["wording"]:
    train_by_fold(
        train,
        target=target,
        model_config = Model_Config
    )

# Evaluation

In [None]:
def predict_train(
    train_df: pd.DataFrame,
    target:str,
    model_config:dict
    ) -> pd.DataFrame:
    """predict oof data"""

    model_root_dir = model_config['model_root_dir']
    model_name = model_config['model_name']
    n_splits = model_config['n_splits']

    for fold in range(n_splits):

        print(f"fold {fold}:")

        valid_data = train_df[train_df["fold"] == fold]

        model_dir =  os.path.join(SYS_VERSION_DIR, model_root_dir, f"{target}/{model_name}/fold_{fold}")

        model = Model(
            model_config = model_config,
            model_dir = model_dir,
            target=target
           )

        pred = model.predict(
            test_df=valid_data,
            fold=fold
        )

        train_df.loc[valid_data.index, f"{target}_pred"] = pred

    return train_df

In [None]:
for target in ["content", "wording"]:

  from sklearn.metrics import mean_squared_error

  train = predict_train(
        train,
        target=target,
        model_config = Model_Config
    )
  rmse = mean_squared_error(train[target], train[f"{target}_pred"], squared=False)
  print(f"cv {target} rmse: {rmse}")

# Prediction

In [None]:
def predict_test(
      test_df: pd.DataFrame,
      target:str,
      model_config:dict
    ) -> pd.DataFrame:
    """predict using mean folds"""

    model_root_dir = model_config['model_root_dir']
    model_name = model_config['model_name']
    n_splits = model_config['n_splits']

    for fold in range(n_splits):
        print(f"fold {fold}:")

        model_dir =  os.path.join(SYS_VERSION_DIR, model_root_dir, f"{target}/{model_name}/fold_{fold}")

        model = Model(
            model_config = model_config,
            model_dir = model_dir,
            target=target
           )

        pred = model.predict(
            test_df=test_df,
            fold=fold
        )

        test_df[f"{target}_pred_{fold}"] = pred

    test_df[f"{target}"] = test_df[[f"{target}_pred_{fold}" for fold in range(n_splits)]].mean(axis=1)

    return test_df

In [None]:
for target in ["content", "wording"]:
    test = predict_test(
          test,
          target=target,
          model_config = Model_Config
      )

## Save data

In [None]:
import dill

dill_data = dict()
dill_data['train'] = train ; dill_data['test'] = test
dill_data_filename = os.path.join(SYS_PROJECT_DIR, 'dill_data_2')
with open(dill_data_filename, "wb") as dill_file:
    dill.dump(dill_data, dill_file)

## Load data

In [None]:
import dill

dill_data_filename = os.path.join(SYS_PROJECT_DIR, 'dill_data_2')
with open(dill_data_filename, "rb") as dill_file:
  dill_data = dill.load(dill_file)
train = dill_data['train'] ; test = dill_data['test']

# LGBM

## Data Preparation

In [None]:
targets = ["content", "wording"]

drop_columns = ["fold", "student_id", "prompt_id", "text",
                "prompt_question", "prompt_title",
                "prompt_text" , "corrected_text" , "llm_gen_answer"
               ] + targets

## Modeling

In [None]:
import lightgbm as lgb

n_splits = Model_Config['n_splits']

model_dict = {}

for target in targets:
    models = []

    for fold in range(n_splits):

        X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
        y_train_cv = train[train["fold"] != fold][target]

        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        params = {
                  'boosting_type': 'gbdt', # gbdt
                  'random_state': 42,
                  'objective': 'regression',
                  'metric': 'rmse',
                  'learning_rate': 0.05,
                  'min_child_samples' : 10
                  }

        evaluation_results = {}
        model = lgb.train(params,
                          num_boost_round=1000,
                          #categorical_feature = categorical_features,
                          valid_names=['train', 'valid'],
                          train_set=dtrain,
                          valid_sets=dval,
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=10, verbose=True),
                              lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        models.append(model)

    model_dict[target] = models

## Evaluation (CV)

In [None]:
import numpy as np

rmses = []

for target in targets:
    models = model_dict[target]

    preds = []
    trues = []

    for fold, model in enumerate(models):
        # ilocで取り出す行を指定
        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)

    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

# Prediction

In [None]:
n_splits = Model_Config['n_splits']

drop_columns = [
                #"fold",
                "student_id", "prompt_id", "text",
                "prompt_question", "prompt_title",
                "prompt_text", "corrected_text" , "llm_gen_answer",
                "input"
               ] + [
                f"content_pred_{i}" for i in range(n_splits)
                ] + [
                f"wording_pred_{i}" for i in range(n_splits)
                ]

In [None]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test.drop(columns=drop_columns)

        pred = model.predict(X_eval_cv)
        preds.append(pred)

    pred_dict[target] = preds

In [None]:
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    test[target] = test[[f"{target}_pred_{fold}" for fold in range(n_splits)]].mean(axis=1)

In [None]:
submission_file = os.path.join(SYS_OUTPUT_DIR, "submission.csv")
test[["student_id", "content", "wording"]].to_csv(submission_file, index=False)