In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from transformers import RobertaModel, AutoTokenizer, get_linear_schedule_with_warmup
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from torch import nn
import sklearn
from tqdm.notebook import tqdm

from nltk.corpus import stopwords

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# One seed shared by every random number generator seeded in this notebook.
SEED = 2021

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)

In [None]:
test = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
test

In [None]:
sample = pd.read_csv('/kaggle/input/commonlitreadabilityprize/sample_submission.csv')
sample

In [None]:
train = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
train['excerpt'][1]

In [None]:
train.describe()

In [None]:
print('Before stopword removal\n\n')
print(train['excerpt'][0])
stop_words = set(stopwords.words('english'))
print('After stopword removal\n\n')
print(" ".join([x for x in train['excerpt'][0].split(" ") if x not in stop_words]))

In [None]:
print('Before special characters removed\n')
print(train['excerpt'][0])

print('After special characters removed\n')
print(" ".join(re.findall(r"\w+", train['excerpt'][0])))

In [None]:
train['excerpt'][5]

In [None]:
# # Stop words
train['excerpt'] = train['excerpt'].apply(lambda x : 
             " ".join([k for k in x.split(" ") if k not in stop_words]))
# Special characters
train['excerpt'] = train['excerpt'].apply(lambda x: " ".join(re.findall(r"\w+", x)))

# # Stop words
test['excerpt'] = test['excerpt'].apply(lambda x : 
             " ".join([k for k in x.split(" ") if k not in stop_words]))
# Special characters
test['excerpt'] = test['excerpt'].apply(lambda x: " ".join(re.findall(r"\w+", x)))

In [None]:
train['excerpt'][5]

In [None]:
train['target'].hist()

In [None]:
train['length'] = train['excerpt'].apply(lambda x: len(x.split(' ')))
train['length'].hist()
print('Value Counts:',train['length'].value_counts())
train.drop(columns=['length'], inplace=True)

In [None]:
sns.heatmap(train[['target', 'standard_error']].corr(), vmin=-1, vmax=1, cmap="RdYlGn", annot=True)

In [None]:
# Use the first CUDA device when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

In [None]:
# Narrow `train` to the three columns listed here; every other column is discarded.
kept_columns = ['excerpt', 'standard_error', 'target']
train = train[kept_columns]

In [None]:
class Model(nn.Module):
    """BERT sequence-classification backbone (one output) bundled with its tokenizer.

    Calling the model on a list of raw strings tokenizes them and returns the
    backbone's output object; callers in this notebook read ``.logits`` from it.

    Parameters
    ----------
    model_name : str
        Path or identifier passed to ``from_pretrained`` for both the tokenizer
        and the backbone.
    fc_dim : int
        Unused; kept so existing call sites keep working.
    use_fc : bool
        When true, ``dropout`` and ``final`` are created. ``forward`` does not use
        them, but ``final`` adds ``final.weight`` / ``final.bias`` to the state
        dict, so it must stay for state dicts saved with this layout to load.
    max_length : int
        Maximum number of tokens per text; longer inputs are truncated.
    """

    def __init__(
        self,
        model_name = '../input/bert-base-uncased',
        fc_dim = 768,
        use_fc = True,
        max_length = 150
    ):
        # Imported locally so the class does not rely on some other notebook cell
        # having executed `import transformers` first.
        from transformers import BertForSequenceClassification

        super(Model, self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.backbone = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
        # Follow the notebook-wide `device` (which falls back to CPU) instead of
        # calling .cuda() unconditionally, which fails on machines without CUDA.
        self.backbone.to(device)

        in_features = 768
        self.use_fc = use_fc
        self.max_length = max_length

        if use_fc:
            self.dropout = nn.Dropout(0.3)
            self.final = nn.Linear(in_features, 1)

    def _init_params(self):
        """Xavier-initialise the optional linear heads that exist on this instance.

        Layers that were never created (e.g. ``classifier``) are skipped instead
        of raising AttributeError.
        """
        for layer_name in ('classifier', 'final'):
            layer = getattr(self, layer_name, None)
            if layer is not None:
                nn.init.xavier_normal_(layer.weight)

    def forward(self, texts):
        """Tokenize `texts` (a list of strings) and return the backbone's output object."""
        encoding = self.tokenizer(texts, padding=True, truncation=True,
                                  max_length=self.max_length, return_tensors='pt')
        # Send the inputs to wherever the backbone's weights currently live, so the
        # call keeps working if the model is moved after construction.
        encoding = encoding.to(next(self.backbone.parameters()).device)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        return self.backbone(input_ids, attention_mask=attention_mask)

In [None]:
# Makes the `transformers` package available by name in the notebook namespace.
import transformers
# First instantiation with the default arguments; the inference cell further down
# rebinds `model` to freshly built per-fold instances.
model = Model()

In [None]:
class ExcerptDataset(torch.utils.data.Dataset):
    """Map-style dataset over one text column of a DataFrame.

    In training mode (``submission=False``) an item is ``(text, label)``, where
    ``label`` is a float32 tensor of shape ``(1,)`` taken from ``label_column``.
    In submission mode an item is just the text and no labels are stored.
    """

    def __init__(self, df, text_column, label_column=None, submission=False):
        text_series = df[text_column]
        self.submission = submission
        if not submission:
            self.labels = df[label_column].values

        # Materialise the column as a plain Python list, indexed by position.
        self.titles = list(text_series)

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        excerpt = self.titles[idx]
        if self.submission:
            return excerpt
        target = torch.tensor(self.labels[idx], dtype=torch.float32).reshape(1)
        return excerpt, target

Naive train test split

In [None]:
# train, val = sklearn.model_selection.train_test_split(train)

Stratified K-fold split (folds stratified on the binned target)

In [None]:
data = train.copy()
num_bins = int(np.floor(1 + np.log2(len(data)))) # Sturge's Rule
data["bins"] = pd.cut(data['target'], bins=num_bins, labels=False)

kf = sklearn.model_selection.StratifiedKFold(3)

for i, (train_index, test_index) in enumerate(kf.split(data, data['bins'])):
    data.loc[test_index, 'kfold'] = i

data = data.drop(columns=['bins'])

for i, group in enumerate(data.groupby('kfold')):
#     print(group[1]['target'])
    plt.subplot(3, 1, i+1)
    plt.hist(group[1]['target'])
#     break
print("K-Fold Histogram of Target")
plt.show()

In [None]:
data['kfold'] = data['kfold'].astype(int)
data

In [None]:
# train_dataset = ExcerptDataset(train, 'excerpt', 'target')
# train_dataloader = torch.utils.data.DataLoader(
#     train_dataset,
#     batch_size = 32,
#     num_workers = 4,
#     pin_memory = True,
#     shuffle = True,
#     drop_last = False
# )
# val_dataset = ExcerptDataset(val, 'excerpt', 'target')
# val_dataloader = torch.utils.data.DataLoader(
#     val_dataset,
#     batch_size = 32,
#     num_workers = 4,
#     pin_memory = True,
#     shuffle = True,
#     drop_last = False
# )

In [None]:
# model(next(iter(train_dataloader))[0]).logits

In [None]:
def validate(model, val_dataloader, loss_fn):
    """Return the mean per-batch loss of `model` over `val_dataloader`.

    The model is switched to eval mode for the pass (so dropout is inactive)
    and its previous train/eval mode is restored afterwards. Gradients are not
    tracked.

    Parameters
    ----------
    model : callable module
        Called with a list of texts; its result must expose ``.logits``.
    val_dataloader : sized iterable of ``(texts, labels)`` batches
    loss_fn : callable
        Called as ``loss_fn(logits, labels)``; the result must support ``.item()``.

    Returns
    -------
    float
        Sum of the batch losses divided by the number of batches.

    Raises
    ------
    ValueError
        If `val_dataloader` has no batches.
    """
    num_batches = len(val_dataloader)
    if num_batches == 0:
        raise ValueError("val_dataloader is empty; cannot compute a validation loss")

    was_training = model.training
    model.eval()
    total_loss = 0.0
    try:
        with torch.no_grad():
            for texts, labels in val_dataloader:
                outputs = model(list(texts)).logits
                # Put the labels wherever the model produced its outputs.
                loss = loss_fn(outputs, labels.to(outputs.device))
                total_loss += loss.item()
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    gc.collect()
    return total_loss / num_batches

In [None]:
# epocs=10
# optimizer=torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# loss_fn=nn.MSELoss()
# total_steps = len(train_dataloader)*epocs
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=total_steps)
# train_loss_history=[]
# val_loss_history=[]
# best_val=None
# progress = tqdm(range(epocs))
# for i in progress:
#     tk=tqdm(train_dataloader, desc="Training epoch: "+str(i+1))
#     epoch_loss=0
#     for j, (texts, labels) in enumerate(tk):
#         texts = list(texts)
#         labels = labels.to(device)
        
#         outputs = model(texts).logits
# #         print(outputs.detach().cpu().numpy())
# #         break
#         loss = loss_fn(outputs, labels)
#         epoch_loss += loss.item()
        
#         optimizer.zero_grad()
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         optimizer.step()
#         scheduler.step()
# #         break
#     epoch_loss = epoch_loss/len(train_dataloader)
#     val_loss = validate(model, val_dataloader, loss_fn)
#     val_loss_history.append(val_loss)
#     train_loss_history.append(epoch_loss)
#     progress.set_postfix({'Epoch Loss': epoch_loss, 'Val Loss':val_loss})
#     if (best_val==None or val_loss<best_val) and (i>=3):
#         print(f"Saving model at epoch {i+1}")
#         torch.save(model.state_dict(), f'model_epoch{i+1}')
#     print(f'Epoch Loss {epoch_loss}, Val Loss {val_loss}')
    

In [None]:
# Training loader built from every row whose fold id is not 0.
non_fold0_rows = data[data['kfold'] != 0]
train_dataset = ExcerptDataset(non_fold0_rows, 'excerpt', 'target')
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    drop_last=False,
)

Uncomment the next cell to train one model per fold.

In [None]:
# from transformers import AdamW
# epocs=6
# train_loss_history=[]
# val_loss_history=[]
# best_val=None
# progress = tqdm(range(epocs))
# for fold in range(3):
#     model=Model()
#     train = data[data['kfold']!=fold]
#     val = data[data['kfold']==fold]
#     train_dataset = ExcerptDataset(train, 'excerpt', 'target')
#     train_dataloader = torch.utils.data.DataLoader(
#         train_dataset,
#         batch_size = 32,
#         num_workers = 4,
#         pin_memory = True,
#         shuffle = True,
#         drop_last = False
#     )
#     val_dataset = ExcerptDataset(val, 'excerpt', 'target')
#     val_dataloader = torch.utils.data.DataLoader(
#         val_dataset,
#         batch_size = 32,
#         num_workers = 4,
#         pin_memory = True,
#         shuffle = True,
#         drop_last = False
#     )
#     optimizer=AdamW(model.parameters(),
#                   lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
#                   eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
#                   weight_decay= 1e-1
#                 )
#     loss_fn=nn.MSELoss()
#     total_steps = len(train_dataloader)*epocs
#     scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1, num_training_steps=total_steps)
#     print("Training Fold:",(fold+1))
#     best_val=999
#     for i in progress:
#         tk=tqdm(train_dataloader, desc="Training epoch: "+str(i+1))
#         epoch_loss=0
#         for j, (texts, labels) in enumerate(tk):
#             texts = list(texts)
#             labels = labels.to(device)

#             outputs = model(texts).logits
#     #         print(outputs.detach().cpu().numpy())
#     #         break
#             loss = loss_fn(outputs, labels)
#             epoch_loss += loss.item()

#             optimizer.zero_grad()
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#             optimizer.step()
#             scheduler.step()
#     #         break
#         epoch_loss = epoch_loss/len(train_dataloader)
#         val_loss = validate(model, val_dataloader, loss_fn)
#         val_loss_history.append(val_loss)
#         train_loss_history.append(epoch_loss)
#         progress.set_postfix({'Epoch Loss': epoch_loss, 'Val Loss':val_loss})
#         if (best_val==999 or val_loss<best_val):
#             print(f"Saving model at epoch {i+1}")
#             torch.save(model.state_dict(), f'model_fold{fold+1}')
#             best_val=val_loss
#         print(f'Epoch Loss {epoch_loss}, Val Loss {val_loss}')

In [None]:
# plt.plot(train_loss_history)
# plt.plot(val_loss_history)

In [None]:
# Inference loader: unlabeled test excerpts, served in file order (no shuffling).
test_dataset = ExcerptDataset(df=test, text_column='excerpt', submission=True)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=2,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    drop_last=False,
)

In [None]:
all_labels = []
models = []
for fold in range(3):
    model=Model()
    model.eval()
    model.load_state_dict(torch.load(f'../input/pretrained-readability/model_fold{fold+1}'))
    models.append(model)
for fold in range(3):
    labels=[]
    for texts in test_dataloader:
        with torch.no_grad():
            output = models[fold](texts).logits
            labels.extend(output.cpu().numpy())
            
    all_labels.append(labels)

In [None]:
all_labels = np.array(all_labels, dtype=object)
# print(all_labels)
print("All outputs")
all_labels = all_labels.reshape(3, -1).transpose()
print(all_labels)
print("Aggregated outputs")
mean_labels = np.mean(all_labels, axis=1)
print(mean_labels)


In [None]:
test['target']=-1
test['target']=mean_labels

In [None]:
test=test[['id', 'target']]
test

In [None]:
test.to_csv("submission.csv", index=False)

In [None]:
test