In [1]:
import pandas as pd
import numpy as np

import os
import shutil
import json
import warnings
import logging
import time

import torch

import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from datasets import Dataset,load_dataset, load_from_disk
from datasets import load_metric, disable_progress_bar

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import KFold, GroupKFold


warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()

In [2]:
PATH = 'C:/Users/shaur/Downloads/commonlit-evaluate-student-summaries'

prompts_train = pd.read_csv(f'{PATH}/prompts_train.csv')
prompts_test = pd.read_csv(f'{PATH}/prompts_test.csv')

summaries_train = pd.read_csv(f'{PATH}/summaries_train.csv')
summaries_test = pd.read_csv(f'{PATH}/summaries_test.csv')

sample_submission = pd.read_csv(f'{PATH}/sample_submission.csv')

prompts_train

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [3]:
train = summaries_train.merge(prompts_train, how="left", on="prompt_id")#[:2000]
test = summaries_test.merge(prompts_test, how="left", on="prompt_id")

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7165 entries, 0 to 7164
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   student_id       7165 non-null   object 
 1   prompt_id        7165 non-null   object 
 2   text             7165 non-null   object 
 3   content          7165 non-null   float64
 4   wording          7165 non-null   float64
 5   prompt_question  7165 non-null   object 
 6   prompt_title     7165 non-null   object 
 7   prompt_text      7165 non-null   object 
dtypes: float64(2), object(6)
memory usage: 447.9+ KB


In [5]:
train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


In [6]:
def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

In [7]:
class CFG:
    model_name="roberta-base"
    learning_rate=1.2e-5
    warmup_ratio=0.01
    weight_decay=0.02
    hidden_dropout_prob=0.01
    attention_probs_dropout_prob=0.01
    num_layers_to_freeze=100
    num_train_epochs=3
    n_splits=4
    batch_size=9
    random_seed=42
    save_steps=70
    max_length=512

In [8]:
def train_n_infer(train,
                 val,
                 model_name,
                 batch_size,
                 learning_rate,
                 warmup_ratio,
                  weight_decay,
                 hidden_dropout_prob,
                 attention_probs_dropoup_prob,
#                  num_layers_to_freeze,
                 num_train_epochs,
                 save_steps,
                 random_seed,
                 max_length,
                 model_dir):
    train_content = train[["text","content","wording"]]
    val_content = val[["text","content","wording"]]
    test_content = test[["text"]]
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.update({
        "num_labels":2,
        "problem_type":"regression"
    })
    device = torch.device("cuda" if torch.cuda.is_available() else cpu)
    model_content = AutoModelForSequenceClassification.from_pretrained(model_name, config=model_config).to(device)
    
#     for name, param in list(model_content.named_parameters())[: num_layers_to_freeze]:
#         param.requires_grad = False
    
    def tokenize_function(examples):
        labels = [examples["content"], examples["wording"]]
        tokenized = tokenizer(examples["text"],
                         padding=False,
                         truncation=True,
                         max_length=max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(examples):
        tokenized = tokenizer(examples["text"],
                         padding=False,
                         truncation=True,
                         max_length=max_length)
        return tokenized
    train_dataset_content = Dataset.from_pandas(train_content, preserve_index=False) 
    val_dataset_content = Dataset.from_pandas(val_content, preserve_index=False)
    
    train_tokenized_datasets_content = train_dataset_content.map(tokenize_function, batched=False)
    val_tokenized_datasets_content = val_dataset_content.map(tokenize_function, batched=False)

    test_dataset = Dataset.from_pandas(test_content, preserve_index=False) 
    test_tokenized_dataset = test_dataset.map(tokenize_function_test, batched=False)
    
    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer
    )
    training_args = TrainingArguments(output_dir=model_dir,
                                      load_best_model_at_end=True,
                                      learning_rate=learning_rate,
                                      warmup_ratio=warmup_ratio,
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=8,
                                      num_train_epochs=num_train_epochs,
                                      weight_decay=weight_decay,
                                      report_to='none',
                                      disable_tqdm=True,
#                                       greater_is_better=False,
                                      save_strategy="steps",
                                      evaluation_strategy="steps",
                                      eval_steps=save_steps,
                                      save_steps=save_steps,
                                      metric_for_best_model="mcrmse",
                                      save_total_limit=5
                                     )
    trainer_content = Trainer(
        model=model_content,
        args=training_args,
        train_dataset=train_tokenized_datasets_content,
        eval_dataset=val_tokenized_datasets_content,
        tokenizer=tokenizer,
        compute_metrics=compute_mcrmse,#compute_metrics,
        data_collator=data_collator
    )
    trainer_content.train()
    
    time.sleep(5)
    best_check = os.listdir(model_dir)[0]
    model_content = AutoModelForSequenceClassification.from_pretrained(f"{model_dir}/{best_check}")
    model_content.eval()

    test_args = TrainingArguments(
        output_dir=model_dir,
        do_train = False,
        do_predict = True,
        per_device_eval_batch_size = 4,   
        dataloader_drop_last = False,
    )

    # init trainer
    infer_content = Trainer(
                  model = model_content, 
                  tokenizer=tokenizer,
                  data_collator=data_collator,
                  args = test_args)

    val_results_content = infer_content.predict(val_tokenized_datasets_content)[0]
    test_results_content = infer_content.predict(test_tokenized_dataset)[0]
    
    model_content.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    
#     try:
#         shutil.rmtree(f"{model_dir}/{best_check}")
#     except Exception:
#         pass
#     time.sleep(5)

    return val_results_content, test_results_content

In [9]:
def get_oof_pred_n_test(train,
                       model_name,
                       n_splits,
                       batch_size,
                       learning_rate,
                        warmup_ratio,
                        hidden_dropout_prob,
                        attention_probs_dropout_prob,
                        num_layers_to_freeze,
                        weight_decay,
                        num_train_epochs,
                        random_seed,
                        save_steps,
                        max_length
                       ):
    kf1 = GroupKFold(n_splits)
    oof_content = np.zeros((len(train), 2))
    test_pred_content = np.zeros((len(test), 2))
    
    model_name_ = model_name+"12"
    if not os.path.exists(model_name_):
        try:
            os.mkdir(model_name_)
        except Exception:
            pass
    time.sleep(5)
    
    for i, (train_indx, val_indx) in enumerate(kf1.split(train,  groups=train["prompt_id"])):
        print(f"fold {i}:")
        train_ = train.iloc[train_indx]
        val_ = train.iloc[val_indx]
        
        val_res_content, test_res_content = train_n_infer(train_,
                                                         val_,
                                                         model_name,
                                                         batch_size,
                                                         learning_rate,
                                                         warmup_ratio,
                                                         hidden_dropout_prob,
                                                          attention_probs_dropout_prob,
                                                          weight_decay,
                                                          num_train_epochs,
                                                          save_steps,
                                                          random_seed,
                                                          max_length,
                                                          model_dir=f"{model_name_}/fold_{i}"
                                                         )
        oof_content[val_indx] = val_res_content
        test_pred_content += test_res_content/n_splits
    
    oof_train = pd.DataFrame(oof_content, columns=[f"content_pred_{model_name_}", f"wording_pred_{model_name_}"])
    test_pred = pd.DataFrame(test_pred_content, columns=[f"content_pred_{model_name_}", f"wording_pred_{model_name_}"])

    cv_metric = compute_mcrmse((oof_train.values, train[["content", "wording"]]))
    print(f"cv mcrmse: {cv_metric}")
    with open(f"{model_name_}/cv_metric.json", "w") as outfile:
        json.dump(cv_metric, outfile)
        
    oof_train.to_csv(f"{model_name_}/oof_train.csv", index=False)
    test_pred.to_csv(f"test_pred.csv", index=False)
    
    return oof_train, test_pred

In [10]:
oof_train, test_pred  = get_oof_pred_n_test(train,
                                            model_name=CFG.model_name,
                                            learning_rate=CFG.learning_rate,
                                            warmup_ratio=CFG.warmup_ratio,
                                            hidden_dropout_prob=CFG.hidden_dropout_prob,
                                            attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
                                            num_layers_to_freeze=CFG.num_layers_to_freeze,
                                            weight_decay=CFG.weight_decay,
                                            num_train_epochs=CFG.num_train_epochs,
                                            n_splits=CFG.n_splits,
                                            batch_size=CFG.batch_size,
                                            random_seed=CFG.random_seed,
                                            save_steps=CFG.save_steps,
                                            max_length=CFG.max_length
                                           )

fold 0:
{'eval_loss': 0.5523695349693298, 'eval_content_rmse': 0.5763614177703857, 'eval_wording_rmse': 0.8789457082748413, 'eval_mcrmse': 0.7276535630226135, 'eval_runtime': 21.9721, 'eval_samples_per_second': 93.619, 'eval_steps_per_second': 11.742, 'epoch': 0.12}
{'eval_loss': 0.33143454790115356, 'eval_content_rmse': 0.4698494076728821, 'eval_wording_rmse': 0.6649138927459717, 'eval_mcrmse': 0.5673816204071045, 'eval_runtime': 20.5876, 'eval_samples_per_second': 99.914, 'eval_steps_per_second': 12.532, 'epoch': 0.25}
{'eval_loss': 0.36663180589675903, 'eval_content_rmse': 0.5179646611213684, 'eval_wording_rmse': 0.6818918585777283, 'eval_mcrmse': 0.5999282598495483, 'eval_runtime': 21.726, 'eval_samples_per_second': 94.679, 'eval_steps_per_second': 11.875, 'epoch': 0.37}
{'eval_loss': 0.31590697169303894, 'eval_content_rmse': 0.46712514758110046, 'eval_wording_rmse': 0.6431236863136292, 'eval_mcrmse': 0.5551244020462036, 'eval_runtime': 21.7343, 'eval_samples_per_second': 94.643, '

fold 1:
{'eval_loss': 0.5616319179534912, 'eval_content_rmse': 0.6678569912910461, 'eval_wording_rmse': 0.822939932346344, 'eval_mcrmse': 0.7453984618186951, 'eval_runtime': 25.3802, 'eval_samples_per_second': 79.156, 'eval_steps_per_second': 9.929, 'epoch': 0.12}
{'eval_loss': 0.5942267179489136, 'eval_content_rmse': 0.5917056202888489, 'eval_wording_rmse': 0.9156083464622498, 'eval_mcrmse': 0.7536569833755493, 'eval_runtime': 30.538, 'eval_samples_per_second': 65.787, 'eval_steps_per_second': 8.252, 'epoch': 0.24}
{'eval_loss': 0.5398957133293152, 'eval_content_rmse': 0.613020658493042, 'eval_wording_rmse': 0.839045524597168, 'eval_mcrmse': 0.726033091545105, 'eval_runtime': 30.6544, 'eval_samples_per_second': 65.537, 'eval_steps_per_second': 8.221, 'epoch': 0.37}
{'eval_loss': 0.6434057950973511, 'eval_content_rmse': 0.7209590673446655, 'eval_wording_rmse': 0.8758023381233215, 'eval_mcrmse': 0.7983807325363159, 'eval_runtime': 48.2202, 'eval_samples_per_second': 41.663, 'eval_steps_

fold 2:
{'eval_loss': 0.7648281455039978, 'eval_content_rmse': 0.8080994486808777, 'eval_wording_rmse': 0.9362862706184387, 'eval_mcrmse': 0.8721928596496582, 'eval_runtime': 26.5362, 'eval_samples_per_second': 75.218, 'eval_steps_per_second': 9.421, 'epoch': 0.12}
{'eval_loss': 0.44533291459083557, 'eval_content_rmse': 0.5820702910423279, 'eval_wording_rmse': 0.7428726553916931, 'eval_mcrmse': 0.6624714732170105, 'eval_runtime': 27.3911, 'eval_samples_per_second': 72.87, 'eval_steps_per_second': 9.127, 'epoch': 0.24}
{'eval_loss': 0.4338538348674774, 'eval_content_rmse': 0.549263060092926, 'eval_wording_rmse': 0.7523418664932251, 'eval_mcrmse': 0.650802493095398, 'eval_runtime': 31.3691, 'eval_samples_per_second': 63.629, 'eval_steps_per_second': 7.97, 'epoch': 0.37}
{'eval_loss': 0.43823450803756714, 'eval_content_rmse': 0.5028978586196899, 'eval_wording_rmse': 0.7896600961685181, 'eval_mcrmse': 0.646278977394104, 'eval_runtime': 31.3241, 'eval_samples_per_second': 63.721, 'eval_step

fold 3:
{'eval_loss': 0.5794399976730347, 'eval_content_rmse': 0.5541487336158752, 'eval_wording_rmse': 0.9229298830032349, 'eval_mcrmse': 0.7385393381118774, 'eval_runtime': 9.1024, 'eval_samples_per_second': 121.177, 'eval_steps_per_second': 15.161, 'epoch': 0.1}
{'eval_loss': 0.5783413052558899, 'eval_content_rmse': 0.5390334129333496, 'eval_wording_rmse': 0.9306588172912598, 'eval_mcrmse': 0.7348461151123047, 'eval_runtime': 9.1865, 'eval_samples_per_second': 120.067, 'eval_steps_per_second': 15.022, 'epoch': 0.21}
{'eval_loss': 0.896065354347229, 'eval_content_rmse': 0.7334596514701843, 'eval_wording_rmse': 1.1198965311050415, 'eval_mcrmse': 0.9266780614852905, 'eval_runtime': 8.5297, 'eval_samples_per_second': 129.313, 'eval_steps_per_second': 16.179, 'epoch': 0.31}
{'eval_loss': 0.7315965890884399, 'eval_content_rmse': 0.6348586678504944, 'eval_wording_rmse': 1.0296345949172974, 'eval_mcrmse': 0.8322466611862183, 'eval_runtime': 8.492, 'eval_samples_per_second': 129.887, 'eval_s

cv mcrmse: {'content_rmse': 0.5183165421949071, 'wording_rmse': 0.7267120956785867, 'mcrmse': 0.6225143189367469}


In [11]:
test_pred = pd.read_csv(f"test_pred.csv")

sample_submission["content"] = test_pred.values[:, 0]
sample_submission["wording"] = test_pred.values[:, 1]

sample_submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.116083,-0.857669
1,111111eeeeee,-1.116251,-0.863316
2,222222cccccc,-1.113319,-0.868946
3,333333dddddd,-1.117513,-0.873461
