In [None]:
# Mount Google Drive at /content/drive; the output folder configured in the
# next cell (PATH_TO_FOLDER) lives under this mount point.
from google.colab import drive
import os
drive.mount('/content/drive')

In [None]:
# Run configuration shared by the cells below.
EPOCHS=5  # number of epochs passed to train_model for every fold
BATCH_SIZE=8  # batch size of both the training and the validation DataLoader
max_length=1024  # max_length handed to the tokenizer (longer inputs are truncated)
# Folder that receives the per-fold log files, checkpoints and prediction CSVs;
# the path encodes the batch size and the maximum sequence length of the run.
PATH_TO_FOLDER = f'/content/drive/MyDrive/CommonLit/deberta-v3-base/question_title_text/batch_{BATCH_SIZE}/{max_length}'
START_IN_FOLD = 0  # the CV loop only runs folds whose 1-based number is >= this value

In [None]:
# Create the output folder (and any missing parents); a no-op if it already exists.
os.makedirs(PATH_TO_FOLDER,exist_ok=True)

In [None]:
# Upgrade transformers to the newest available release.
# NOTE(review): the version is not pinned, so a re-run may pick up a different release.
!pip install --upgrade transformers

In [None]:
# Presumably required by the DeBERTa-v3 tokenizer loaded further down - TODO confirm.
!pip install sentencepiece

In [None]:
import pandas as pd
from tqdm import tqdm
import torch
import numpy as np
import torch.nn as nn
import gc
from transformers import AutoTokenizer, AutoModel
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,GroupKFold
import tqdm.notebook as tq
import logging

In [None]:
# !pip install kaggle --upgrade
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive into the current working directory
# (uses the API token installed in the previous cell).
!kaggle competitions download -c commonlit-evaluate-student-summaries

In [None]:
import zipfile
# Unpack the downloaded competition archive into /content/, which is where the
# CSV files are read from later in the notebook.
with zipfile.ZipFile("/content/commonlit-evaluate-student-summaries.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/")

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Tokenizer of the same 'microsoft/deberta-v3-base' checkpoint that Model loads.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
device  # bare expression: displays the selected device as the cell output

In [None]:
# Read the student summaries and attach the prompt columns to every row
# (left join on prompt_id keeps each summary row).
prompts_train = pd.read_csv('/content/prompts_train.csv')
data = pd.read_csv('/content/summaries_train.csv').merge(
    prompts_train, on='prompt_id', how='left'
)

# Model input: question, title and student text, joined by the tokenizer's
# separator token surrounded by single spaces.
field_separator = ' ' + tokenizer.sep_token + ' '
data['question_title_text'] = (
    data['prompt_question']
    + field_separator
    + data['prompt_title']
    + field_separator
    + data['text']
)

# Plain-Python inputs and the two regression targets (column 0: wording, column 1: content).
X = data['question_title_text'].tolist()
y = data[['wording', 'content']].to_numpy()

In [None]:
class Model(nn.Module):
    """DeBERTa-v3-base encoder followed by a two-layer regression head.

    The token embeddings of the encoder are mean-pooled over the non-padded
    positions and mapped to two outputs per sample. The training code in this
    notebook treats output column 0 as `wording` and column 1 as `content`.
    """

    def __init__(self,hidden_units=768):
        """Load the pretrained encoder and build the head.

        Args:
            hidden_units: width of the intermediate dense layer. The default
                (768) reproduces the original layer shapes, so previously
                saved state dicts still load.
        """
        super(Model, self).__init__()
        self.transformer = AutoModel.from_pretrained('microsoft/deberta-v3-base')
        self.transformer.gradient_checkpointing_enable()
        # Take the input width of the head from the loaded checkpoint instead of
        # hard-coding it, and honour `hidden_units`, which used to be ignored.
        encoder_dim = self.transformer.config.hidden_size
        self.linear1 = nn.Linear(encoder_dim, hidden_units)
        self.linear2 = nn.Linear(hidden_units, 2)

    def forward(self, input_ids, attention_mask):
        """Return a (batch, 2) tensor of predictions for the tokenised inputs.

        Args:
            input_ids: token ids, forwarded unchanged to the encoder.
            attention_mask: mask forwarded to the encoder and reused by the
                pooling step to ignore padded positions.
        """
        # Kept from the original implementation: collect garbage and release
        # cached CUDA blocks before every forward pass.
        gc.collect()
        torch.cuda.empty_cache()
        model_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = self.mean_pooling(model_output, attention_mask)
        # Functional ReLU: no need to instantiate an nn.ReLU module on every call.
        hidden = torch.relu(self.linear1(embeddings))
        return self.linear2(hidden)

    def mean_pooling(self,model_output, attention_mask):
        """Average the token embeddings over the positions where the mask is 1.

        Args:
            model_output: encoder output; its first element holds the token embeddings.
            attention_mask: (batch, seq) mask; positions with 0 do not contribute.

        Returns:
            One pooled vector per sample.
        """
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        # Clamp the denominator so a fully masked row does not divide by zero.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

In [None]:
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error of two equally long 1-D sequences."""
    # sqrt(MSE) replaces `mean_squared_error(..., squared=False)`: the `squared`
    # keyword is deprecated in recent scikit-learn releases.
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


def _stats_line(label, mean_loss, true_w, pred_w, true_c, pred_c):
    """Format the per-epoch summary written to both the console and the log file."""
    wording_rmse = _rmse(true_w, pred_w)
    content_rmse = _rmse(true_c, pred_c)
    mcrmse = (wording_rmse + content_rmse) / 2
    return f"{label} Statistic: Loss:{mean_loss}, MCRMSE: {mcrmse}, Wording: {wording_rmse},Content:{content_rmse}"


def _run_epoch(model, criterion, loader, optimizer=None):
    """Run one pass over `loader`; updates the weights only when `optimizer` is given.

    Returns:
        (mean_loss, true_wording, pred_wording, true_content, pred_content),
        where the four lists hold one Python float per sample of this pass.
    """
    training = optimizer is not None
    # Select the mode explicitly for every pass so that a preceding validation
    # pass cannot leave the model in eval mode for the next training pass.
    model.train(training)

    # Fresh accumulators per pass: statistics never mix samples of other passes.
    true_w, pred_w, true_c, pred_c = [], [], [], []
    running_loss = 0.0

    with torch.set_grad_enabled(training):
        for input_ids, attention_mask, wording, content in tq.tqdm(loader):
            gc.collect()
            torch.cuda.empty_cache()

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            content = content.float().to(device)
            wording = wording.float().to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            # Sum of the per-target losses; column 0 is wording, column 1 is content.
            loss = criterion(outputs[:, 0], wording) + criterion(outputs[:, 1], content)
            if training:
                loss.backward()
                optimizer.step()
            running_loss += loss.item()

            predictions = outputs.detach().cpu()
            true_w.extend(wording.detach().cpu().tolist())
            true_c.extend(content.detach().cpu().tolist())
            pred_w.extend(predictions[:, 0].tolist())
            pred_c.extend(predictions[:, 1].tolist())

    return running_loss / len(loader), true_w, pred_w, true_c, pred_c


def train_model(model,optimizer,criterion,train_loader,val_loader,fold,epochs=10):
    """Train `model` for `epochs` epochs, validating after each one.

    Per-epoch statistics (mean loss, MCRMSE and the RMSE of each target) are
    printed and also written to `{PATH_TO_FOLDER}/fold_{fold}_log.txt`.

    Args:
        model: module called as `model(input_ids, attention_mask)` and returning
            a tensor whose columns 0 and 1 are the wording and content predictions.
        optimizer: optimizer stepping the parameters of `model`.
        criterion: loss applied separately to each target; the two values are summed.
        train_loader: iterable of (input_ids, attention_mask, wording, content) batches.
        val_loader: iterable with the same batch layout, used for validation only.
        fold: fold identifier, used in the log file name and the final message.
        epochs: number of training epochs.

    Returns:
        The trained model (left in eval mode by the last validation pass).
    """
    # The context manager closes the log file even if training raises.
    with open(f"{PATH_TO_FOLDER}/fold_{fold}_log.txt","w") as file:
        for epoch in range(epochs):
            gc.collect()
            torch.cuda.empty_cache()
            print(f"\n")
            print(f"Epoch: {epoch+1}")
            file.write(f"Epoch: {epoch+1}")
            file.write("\n")

            train_line = _stats_line("Train", *_run_epoch(model, criterion, train_loader, optimizer))
            print("\n")
            print(train_line)
            file.write("\n")
            file.write(train_line)
            file.write("\n")

            # Validation pass: no optimizer, hence eval mode and no gradients.
            val_line = _stats_line("Validation", *_run_epoch(model, criterion, val_loader))
            gc.collect()
            torch.cuda.empty_cache()
            print("\n")
            print(val_line)
            file.write("\n")
            file.write(val_line)
            file.write("\n")
            # Push the epoch's lines to disk so an interrupted run keeps its log.
            file.flush()

    print(f"End of Training fold {fold}! \n")
    return model


In [None]:
def save_df(model,loader,fold,test_index):
    """Predict every batch of `loader` and store targets and predictions as CSV.

    The model is put in eval mode and run without gradients. The resulting
    table has the columns idx, true_word, true_content, pred_word and
    pred_content and is written to
    `{PATH_TO_FOLDER}/predictions_fold_{fold}.csv` without the index column.

    Args:
        model: module called as `model(input_ids, attention_mask)`; columns 0
            and 1 of its output are stored as pred_word and pred_content.
        loader: iterable of (input_ids, attention_mask, wording, content) batches.
        fold: fold identifier used in the progress message and the file name.
        test_index: values stored in the `idx` column, one per predicted sample.
    """
    def _flatten(batches):
        # One Python float per sample, in the order the batches were seen.
        return [float(value) for batch in batches for value in batch]

    model.eval()
    print(f"Saving Dataframe of Fold {fold}...")

    collected = {
        'true_word': [],
        'true_content': [],
        'pred_word': [],
        'pred_content': [],
    }

    with torch.no_grad():
        for batch in tq.tqdm(loader):
            input_ids, attention_mask, wording, content = batch

            predictions = model(input_ids.to(device), attention_mask.to(device))
            predictions = predictions.detach().cpu().numpy()

            collected['true_word'].append(wording.float().to(device).detach().cpu().numpy())
            collected['true_content'].append(content.float().to(device).detach().cpu().numpy())
            collected['pred_word'].append(predictions[:, 0])
            collected['pred_content'].append(predictions[:, 1])

    table = {'idx': test_index}
    for column, batches in collected.items():
        table[column] = _flatten(batches)

    pd.DataFrame(table).to_csv(PATH_TO_FOLDER+f'/predictions_fold_{fold}.csv',index=False)
    print(f"Dataframe Saved!")

In [None]:
# Grouped 4-fold cross-validation: every prompt_id forms one group, so all
# summaries of a prompt land in the same split.
group_kfold = GroupKFold(n_splits=4)
folds = {'39c16e':0,
         '3b9047':1,
         'ebad26':2,
         '814d6b':3
}
# Group label of every row; an unknown prompt_id raises KeyError.
groups = np.array([folds[prompt_id] for prompt_id in data['prompt_id']])


def _build_dataset(frame):
    """Tokenise the rows of `frame` and bundle ids, mask and both targets in a TensorDataset."""
    encodings = tokenizer.batch_encode_plus(
        frame['question_title_text'].tolist(),
        max_length=max_length,
        truncation=True,
        padding=True
    )
    return torch.utils.data.TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(frame['wording'].tolist()),
        torch.tensor(frame['content'].tolist())
    )


for fold, (train_index, test_index) in enumerate(group_kfold.split(X, y, groups)):

    # Skip folds below START_IN_FOLD before doing any work for them
    # (tokenising both splits of a skipped fold is wasted time).
    if START_IN_FOLD > (fold+1):
        continue

    # Drop the previous fold's model and optimizer before allocating new ones,
    # so two models never live on the device at the same time.
    model = None
    optimizer = None
    gc.collect()
    torch.cuda.empty_cache()

    # split() yields positional indices, hence .iloc rather than label lookup.
    train_dataset = _build_dataset(data.iloc[train_index])
    validation_dataset = _build_dataset(data.iloc[test_index])

    print(f'Begining Fold:{fold+1}')
    model = Model().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    criterion = nn.MSELoss()

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,drop_last=False)
    # Validation order must stay fixed: save_df pairs predictions with test_index by position.
    val_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False,drop_last=False)

    model = train_model(model,optimizer,criterion,train_loader,val_loader,fold+1,EPOCHS)

    # Checkpoint of the weights after the last epoch of this fold.
    torch.save({
        'fold':fold+1,
        'model_state_dict': model.state_dict()
        # 'epoch':best_epoch
    },PATH_TO_FOLDER+f'/fold_{fold+1}')

    # Out-of-fold predictions for the validation split of this fold.
    save_df(model,val_loader,fold+1,test_index)

# print("\n")
# print("\n")
# print(f"MSRMSE:{np.mean(mcrmse)}")
# print(f"Wording:{np.mean(wording)}")
# print(f"Content:{np.mean(content)}")

Begining Fold:1


Epoch: 1


  0%|          | 0/639 [00:00<?, ?it/s]



Train Statistic: Loss:0.7565736637009142, MCRMSE: 0.6106313656977076, Wording: 0.6848668974584733,Content:0.5363958339369419


  0%|          | 0/258 [00:00<?, ?it/s]



Validation Statistic: Loss:0.501861062016367, MCRMSE: 0.49448747639685126, Wording: 0.5779221789742292,Content:0.41105277381947336


Epoch: 2


  0%|          | 0/639 [00:00<?, ?it/s]



Train Statistic: Loss:0.47502384983084756, MCRMSE: 0.48599435123854606, Wording: 0.5585957114631662,Content:0.4133929910139259


  0%|          | 0/258 [00:00<?, ?it/s]



Validation Statistic: Loss:0.525160882015561, MCRMSE: 0.5031805996286551, Wording: 0.6027565536197034,Content:0.4036046456376068


Epoch: 3


  0%|          | 0/639 [00:00<?, ?it/s]



Train Statistic: Loss:0.37811616618122107, MCRMSE: 0.45342152648333944, Wording: 0.5216364206899494,Content:0.38520663227672947


  0%|          | 0/258 [00:00<?, ?it/s]



Validation Statistic: Loss:0.4770433412099531, MCRMSE: 0.4848406146790982, Wording: 0.5457434190137912,Content:0.4239378103444052


Epoch: 4


  0%|          | 0/639 [00:00<?, ?it/s]



Train Statistic: Loss:0.30716138138364363, MCRMSE: 0.4193279203444227, Wording: 0.46664912703491923,Content:0.3720067136539262


  0%|          | 0/258 [00:00<?, ?it/s]



Validation Statistic: Loss:0.4535264494114144, MCRMSE: 0.47319878084022804, Wording: 0.5253114952284166,Content:0.42108606645203944


Epoch: 5


  0%|          | 0/639 [00:00<?, ?it/s]



Train Statistic: Loss:0.22821056107073695, MCRMSE: 0.38099208638152404, Wording: 0.4170776177583563,Content:0.3449065550046918


  0%|          | 0/258 [00:00<?, ?it/s]



Validation Statistic: Loss:0.5202155047377874, MCRMSE: 0.5040887518195233, Wording: 0.5826314074244541,Content:0.42554609621459244
End of Training fold 1! 

Saving Dataframe of Fold 1...


  0%|          | 0/258 [00:00<?, ?it/s]

Dataframe Saved!
Begining Fold:2


Epoch: 1


  0%|          | 0/645 [00:00<?, ?it/s]



Train Statistic: Loss:0.6991671571551368, MCRMSE: 0.5881713820078742, Wording: 0.6498688405952966,Content:0.526473923420452


  0%|          | 0/252 [00:00<?, ?it/s]



Validation Statistic: Loss:1.311111904089413, MCRMSE: 0.7907431120368139, Wording: 0.9673939397682576,Content:0.6140922843053702


Epoch: 2


  0%|          | 0/645 [00:00<?, ?it/s]



Train Statistic: Loss:0.4353220387831215, MCRMSE: 0.5752042514998708, Wording: 0.67475231784006,Content:0.47565618515968144


  0%|          | 0/252 [00:00<?, ?it/s]



Validation Statistic: Loss:0.973330748814439, MCRMSE: 0.6885235444667511, Wording: 0.8014700992541492,Content:0.575576989679353


Epoch: 3


  0%|          | 0/645 [00:00<?, ?it/s]



Train Statistic: Loss:0.33976086584865584, MCRMSE: 0.5038316369711533, Wording: 0.5731506467027614,Content:0.4345126272395453


  0%|          | 0/252 [00:00<?, ?it/s]



Validation Statistic: Loss:0.8478601492113538, MCRMSE: 0.6392233890948986, Wording: 0.7655714465944793,Content:0.5128753315953181


Epoch: 4


  0%|          | 0/645 [00:00<?, ?it/s]



Train Statistic: Loss:0.2614091870223367, MCRMSE: 0.45723453376144285, Wording: 0.5206193597049239,Content:0.39384970781796186


  0%|          | 0/252 [00:00<?, ?it/s]



Validation Statistic: Loss:0.7523408241806522, MCRMSE: 0.6050724686333612, Wording: 0.708926992139068,Content:0.5012179451276544


Epoch: 5


  0%|          | 0/645 [00:00<?, ?it/s]



Train Statistic: Loss:0.18631246358502743, MCRMSE: 0.4127466278491544, Wording: 0.4612275364028695,Content:0.36426571929543927


  0%|          | 0/252 [00:00<?, ?it/s]



Validation Statistic: Loss:0.8376662787345667, MCRMSE: 0.6351630441581149, Wording: 0.7629660980458561,Content:0.5073599902703736
End of Training fold 2! 

Saving Dataframe of Fold 2...


  0%|          | 0/252 [00:00<?, ?it/s]

Dataframe Saved!
Begining Fold:3


Epoch: 1


  0%|          | 0/647 [00:00<?, ?it/s]



Train Statistic: Loss:0.7621167649556717, MCRMSE: 0.6129745872740138, Wording: 0.6891140921647586,Content:0.5368350823832689


  0%|          | 0/250 [00:00<?, ?it/s]



Validation Statistic: Loss:0.5411752920746803, MCRMSE: 0.5171730667756156, Wording: 0.5701595807842516,Content:0.4641865527669795


Epoch: 2


  0%|          | 0/647 [00:00<?, ?it/s]



Train Statistic: Loss:0.4633470796958372, MCRMSE: 0.48821483390370884, Wording: 0.5535450404378924,Content:0.42288462736952526


  0%|          | 0/250 [00:00<?, ?it/s]