In [20]:
import os
import pandas as pd
import numpy as np

In [21]:
os.chdir("C:/Users/shaur/Downloads/commonlit")
os.getcwd()

'C:\\Users\\shaur\\Downloads\\commonlit'

In [22]:
prompts_train = pd.read_csv("prompts_train.csv")
prompts_test = pd.read_csv("prompts_test.csv")

summaries_train = pd.read_csv("summaries_train.csv")
summaries_test = pd.read_csv("summaries_test.csv")

sample_submission = pd.read_csv("sample_submission.csv")

prompts_train

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [23]:
train = summaries_train.merge(prompts_train, how="left", on="prompt_id")
test = summaries_test.merge(prompts_test, how="left", on="prompt_id")

In [24]:
import warnings
import logging
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset, load_dataset, load_from_disk, load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupShuffleSplit
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [25]:
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
disable_progress_bar()

In [26]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

In [27]:
def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)

    return (content_score + wording_score)/2

def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(seed=42)

In [28]:
splitter = GroupShuffleSplit(test_size=.2, n_splits=4, random_state=42)
split = splitter.split(train, groups=train['prompt_id'])
train_ind, val_ind = next(split)

train_split = train.iloc[train_ind]
val_split = train.iloc[val_ind]

In [29]:
train_split.prompt_id.value_counts()

prompt_id
39c16e    2057
ebad26    1996
814d6b    1103
Name: count, dtype: int64

In [30]:
val_split.prompt_id.value_counts()

prompt_id
3b9047    2009
Name: count, dtype: int64

In [31]:
train_content = train_split[["text", "content", "wording"]]
val_content = val_split[["text", "content", "wording"]]
test_ = test[["text"]]

In [32]:
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
seed_everything(seed=42)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

In [33]:
train_dataset_content = Dataset.from_pandas(train_content, preserve_index=False)
val_dataset_content = Dataset.from_pandas(val_content, preserve_index=False)
test_dataset = Dataset.from_pandas(test_, preserve_index=False)

In [34]:
def tokenize_function(examples):
    labels = [examples["content"], examples["wording"]]
    tokenized = tokenizer(examples["text"],
                         padding=False,
                         truncation=True,
                        )
    return {
        **tokenized,
        "labels": labels,
        }

def tokenize_function_test(examples):
    tokenized = tokenizer(examples["text"],
                         padding=False,
                         truncation=True,
                         )
    return tokenized

train_tokenized_datasets_content = train_dataset_content.map(tokenize_function, batched=False)
val_tokenized_datasets_content = val_dataset_content.map(tokenize_function, batched=False)
test_tokenized_dataset = test_dataset.map(tokenize_function_test, batched=False)


In [35]:
model_dir = f"./Results/{model_name}_results"
os.makedirs(model_dir, exist_ok=True)

In [36]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [37]:
training_args = TrainingArguments(
    output_dir = model_dir,
    load_best_model_at_end = True,
    learning_rate = 1.5e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    greater_is_better=False,
    evaluation_strategy="epoch",
    metric_for_best_model="mcrmse",
    save_total_limit=4
)

trainer_content = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized_datasets_content,
        eval_dataset=val_tokenized_datasets_content,
        tokenizer=tokenizer,
        compute_metrics=compute_mcrmse,
        data_collator=data_collator
    )

In [38]:
trainer_content.train()

                                                  
 33%|███▎      | 430/1290 [02:53<03:03,  4.70it/s]

{'eval_loss': -4.300253868103027, 'eval_content_rmse': 9.076205253601074, 'eval_wording_rmse': 8.771636962890625, 'eval_mcrmse': 8.923921585083008, 'eval_runtime': 22.173, 'eval_samples_per_second': 90.606, 'eval_steps_per_second': 11.365, 'epoch': 1.0}


 39%|███▉      | 500/1290 [03:22<06:01,  2.18it/s]  

{'loss': -3.376, 'learning_rate': 9.186046511627908e-06, 'epoch': 1.16}


                                                  
 67%|██████▋   | 860/1290 [05:51<02:17,  3.12it/s]

{'eval_loss': -7.837543964385986, 'eval_content_rmse': 15.555355072021484, 'eval_wording_rmse': 15.30950927734375, 'eval_mcrmse': 15.432432174682617, 'eval_runtime': 22.205, 'eval_samples_per_second': 90.475, 'eval_steps_per_second': 11.349, 'epoch': 2.0}


 78%|███████▊  | 1000/1290 [06:43<01:47,  2.69it/s]

{'loss': -9.0191, 'learning_rate': 3.372093023255814e-06, 'epoch': 2.33}


                                                   
100%|██████████| 1290/1290 [08:48<00:00,  2.47it/s]

{'eval_loss': -9.418931007385254, 'eval_content_rmse': 19.42833709716797, 'eval_wording_rmse': 19.168067932128906, 'eval_mcrmse': 19.298202514648438, 'eval_runtime': 22.2616, 'eval_samples_per_second': 90.245, 'eval_steps_per_second': 11.32, 'epoch': 3.0}


100%|██████████| 1290/1290 [08:49<00:00,  2.43it/s]

{'train_runtime': 529.7899, 'train_samples_per_second': 29.196, 'train_steps_per_second': 2.435, 'train_loss': -7.398552000060562, 'epoch': 3.0}





TrainOutput(global_step=1290, training_loss=-7.398552000060562, metrics={'train_runtime': 529.7899, 'train_samples_per_second': 29.196, 'train_steps_per_second': 2.435, 'train_loss': -7.398552000060562, 'epoch': 3.0})