In [None]:
import numpy as np 
import pandas as pd 

import os

import seaborn as sns
import matplotlib.pyplot as plt

import re
import unicodedata
import nltk
from nltk.corpus import stopwords

from wordcloud import WordCloud, STOPWORDS 
from PIL import Image
import requests

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from transformers import RobertaTokenizer, RobertaModel
from transformers import AdamW

from tqdm import tqdm


# Setting color palette.
orange_black = [
    '#fdc029', '#df861d', '#FF6347', '#aa3d01', '#a30e15', '#800000', '#171820'
]

In [None]:
# reading the data
# Three CSVs from the Kaggle input directory: the training set (its `excerpt`
# and `target` columns are used below), the test set (only `excerpt` is used),
# and the sample submission, which is later filled with predictions.
train = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('/kaggle/input/commonlitreadabilityprize/sample_submission.csv')

# Data analysis and visualization

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.describe(include='all')

In [None]:
test.head()

In [None]:
sample.head()

In [None]:
# Checking missing values:

def missing_percentage(df):
    """Summarise the missing values of a DataFrame, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        ordered by descending count, with two columns: ``Total`` (number of
        missing values) and ``Percent`` (share of rows missing, 0-100).
        Empty when nothing is missing or when ``df`` has no rows.
    """
    # Count once and reuse; the counts were previously recomputed four times.
    null_counts = df.isnull().sum().sort_values(ascending=False)
    total = null_counts[null_counts != 0]
    # Deriving the percentage from the already-filtered counts keeps both
    # columns on the same index and avoids the 0/0 -> NaN rows that a
    # zero-row frame used to produce.
    percent = total / len(df) * 100
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])


# Per-column missing-value summary of the training frame.
missing_train = missing_percentage(train)

plt.figure(figsize=(7, 4))

# One bar per column that has missing values; bar height is the 'Percent'
# column of the summary.
sns.barplot(x=missing_train.index,
            y='Percent',
            data=missing_train,
            palette=orange_black).set_title('Train Data Missing Values')

# Last expression of the cell: display the summary table itself.
missing_percentage(train)

In [None]:
# adding word number as independent variable
# (number of whitespace-separated tokens in each excerpt)
train['word_n'] = train['excerpt'].apply(lambda x: len(x.split()))

In [None]:
# function to plot distribution of a variable
def plot_distribution(variable):
    """Plot the kernel density estimate of a numeric Series twice, side by side.

    The left panel marks the mean and shades the band mean +/- one standard
    deviation; the right panel marks the median and shades the range between
    the 25th and 75th percentiles.

    Parameters
    ----------
    variable : pandas.Series
        Values to plot. Its ``name`` is used as the x-axis label.
    """
    fig, axes = plt.subplots(ncols=2, figsize=(12, 4))
    for panel, ax in enumerate(axes):
        # ``shade=False`` is no longer passed: an unfilled curve is already
        # the default, and recent seaborn releases deprecate ``shade``.
        sns.kdeplot(variable, color='crimson', ax=ax)
        # Read back the drawn curve so the shaded areas follow it exactly.
        kdeline = ax.lines[0]
        xs = kdeline.get_xdata()
        ys = kdeline.get_ydata()
        if panel == 0:
            middle = variable.mean()
            sdev = variable.std()
            left, right = middle - sdev, middle + sdev
            ax.set_title('Showing mean and sdev')
        else:
            left, middle, right = np.percentile(variable, [25, 50, 75])
            ax.set_title('Showing median and quartiles')
        # Dotted marker at the central value, clipped to the curve height.
        ax.vlines(middle, 0, np.interp(middle, xs, ys), color='crimson', ls=':')
        # Light fill under the whole curve, then a second (darker, because
        # overlaid) fill restricted to the [left, right] band.
        ax.fill_between(xs, 0, ys, facecolor='crimson', alpha=0.2)
        ax.fill_between(xs, 0, ys, where=(left <= xs) & (xs <= right), interpolate=True, facecolor='crimson', alpha=0.2)
        # Label every panel; ``plt.xlabel`` only affected the last axes.
        ax.set_xlabel(f'{variable.name}', fontsize=10)
    plt.show()

The target is approximately normally distributed, with a mean of about -1.

In [None]:
# target distribution
plot_distribution(train['target'])

The standard error distribution is skewed to the left, with a mean close to 0.5.

In [None]:
# Standard error distribution
plot_distribution(train['standard_error'])

In [None]:
# word_n distribution
plot_distribution(train['word_n'])

The target tends to decrease as the number of words increases: from about -0.5 when word_n = 140 to about -1 when word_n = 200.

In [None]:
# relationship between number of words per text and score
plt.figure(figsize=(8, 4))

# Line plot of `target` against `word_n`; ci=None requests no confidence band.
# NOTE(review): no `hue` is passed, so `palette` presumably has no effect on
# this single line — confirm against the installed seaborn version.
sns.lineplot(x='word_n',
             y='target',
             data=train,
             palette=orange_black[:2],
             ci=None).set_title('target vs. word_n')


plt.show()

When standard error is the lowest (about 0.4), the target is equal to -1

In [None]:
# relationship between standard error and score
plt.figure(figsize=(8, 4))

# Line plot with `target` on the x-axis and `standard_error` on the y-axis;
# ci=None requests no confidence band.
# NOTE(review): no `hue` is passed, so `palette` presumably has no effect on
# this single line — confirm against the installed seaborn version.
sns.lineplot(x='target',
             y='standard_error',
             data=train,
             palette=orange_black[:2],
             ci=None).set_title('target vs. standard_error')


plt.show()

In [None]:
# Function to create box plot
def box_count_plot(variable, title_var, fig=(10,8), rot=90):
    """Draw a boxplot of the target per category above a count plot of the categories.

    Both panels read from the module-level ``train`` frame and share one
    category order (most frequent first), so each box sits directly above
    the bar, and the tick label, of the same category.

    Parameters
    ----------
    variable : str
        Name of the categorical column of ``train`` to group by.
    title_var : str
        Human-readable name used in the title and the x-axis label.
    fig : tuple
        Figure size passed to ``plt.subplots`` as ``figsize``.
    rot : int
        Rotation, in degrees, of the category tick labels.
    """
    # Separate name for the Figure so the ``fig`` size argument is not shadowed.
    figure, axs = plt.subplots(nrows=2, figsize=fig)

    # Shared ordering. Previously only the count plot was ordered by
    # frequency, so with the top x-axis hidden the boxes did not line up
    # with the labelled bars underneath.
    order = train[variable].value_counts().index

    sns.boxplot(
        x=variable,
        y='target',
        data=train,
        order=order,
        ax=axs[0]
    ).set_title('Boxplot of ' + title_var + ' vs. score', size=20)

    sns.countplot(
        x=variable,
        data=train,
        order=order,
        ax=axs[1]
    )

    # Draw the overall median target as a reference line
    axs[0].axhline(
        y=train['target'].median(),
        color='red',
        linestyle='dotted'
    )

    # Label the bars with counts
    for patch in axs[1].patches:
        count = patch.get_height()
        centre = patch.get_x() + patch.get_width() / 2
        axs[1].annotate(f'{int(count)}', (centre, count), ha='center', va='bottom')

    # Format x-axes: rotate the labels of the bottom panel, hide the top axis
    axs[1].tick_params(axis='x', labelrotation=rot)
    axs[0].xaxis.set_visible(False)

    axs[0].set(ylabel='Score')
    axs[1].set(xlabel=title_var, ylabel='Count')

    # Narrow the gap between the plots
    plt.subplots_adjust(hspace=0.05)

* CC BY 4.0 is the license with highest frequency (391). Mean score with this license is slightly above the mean (-1).
* CC BY-SA 3.0 and GFDL is the license with the second highest frequency (196). Mean score with this license is below the mean (-1.5).
* CC BY-SA 3.0 is the license with the third highest frequency (192). Mean score with this license is above the mean (-0.1).

In [None]:
train['license'].unique()

In [None]:
# License vs. score
box_count_plot('license', 'License', fig=(10,8), rot=90)

Example of texts with highest/lowest score

In [None]:
# Looking at the text with the highest score.
# `Series.all()` is a truthiness reduction, not a way to display rows, so the
# top row of the descending sort is selected explicitly instead.
train.sort_values(['target'], ascending=False)['excerpt'].iloc[0]

In [None]:
# Looking at the text with the lowest score.
# `Series.all()` is a truthiness reduction, not a way to display rows, so the
# top row of the ascending sort is selected explicitly instead.
train.sort_values(['target'], ascending=True)['excerpt'].iloc[0]

### Drawing n-gram plots

In [None]:
# add appropriate words that will be ignored in the analysis
additional_stopwords = []

# cleaning words
def basic_clean(text):
    """Turn raw text into a list of lemmatized, non-stop-word tokens.

    The text is Unicode-normalised (NFKD), reduced to ASCII (characters that
    cannot be encoded are dropped), lower-cased, stripped of every character
    that is neither a word character nor whitespace, and split on whitespace.
    Tokens found in NLTK's English stop-word list or in
    ``additional_stopwords`` are removed; the rest are lemmatized with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # A set makes each membership test constant-time (the list was scanned
    # once per token); the distinct local name also stops shadowing the
    # ``stopwords`` module imported at the top of the notebook.
    stop_words = set(nltk.corpus.stopwords.words('english')) | set(additional_stopwords)
    ascii_text = (unicodedata.normalize('NFKD', text)
                             .encode('ascii', 'ignore')
                             .decode('utf-8', 'ignore')
                             .lower())
    tokens = re.sub(r'[^\w\s]', '', ascii_text).split()
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

# all words
# Join the excerpts into one string before cleaning. `str(list)` would pass
# the list's repr instead, in which every newline is a literal backslash-n;
# the backslash is stripped as punctuation and the leftover "n" glues the
# neighbouring words together.
words = basic_clean(' '.join(train['excerpt'].astype(str)))

In [None]:
# plotting ngram
def plotting_ngram(n, top_words):
    """Plot the most frequent n-grams of the module-level ``words`` list.

    Parameters
    ----------
    n : int
        Size of the n-grams (2 for bigrams, 3 for trigrams, ...).
    top_words : int
        Number of most frequent n-grams to show.
    """
    ngram_counts = pd.Series(nltk.ngrams(words, n)).value_counts()[:top_words]
    # Ascending sort so the most frequent n-gram ends up at the top of the
    # horizontal bar chart.
    ngram_counts.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
    # The title reports the requested count rather than a hard-coded "20".
    plt.title(f'{top_words} Most Frequently Occurring {n}grams')
    plt.ylabel(f'{n}gram')
    plt.xlabel('# of Occurrences')
    

In [None]:
# bigram
plotting_ngram(2, 10)

In [None]:
# trigram
plotting_ngram(3, 10)

### Word cloud

In [None]:
# combining all words in one paragraph
def combine_all_words():
    """Concatenate every training excerpt into one lower-cased string.

    Each excerpt from the module-level ``train`` frame is converted to
    ``str``, split on whitespace, lower-cased token by token, re-joined with
    single spaces and followed by one trailing space.

    Returns
    -------
    str
        The concatenation of all processed excerpts.
    """
    pieces = [
        " ".join(token.lower() for token in str(excerpt).split()) + " "
        for excerpt in train['excerpt']
    ]
    return ''.join(pieces)

In [None]:
# create and plot word cloud
def create_word_cloud(mask_url='http://www.clker.com/cliparts/O/i/x/Y/q/P/yellow-house-hi.png',
                      timeout=10):
    """Render a word cloud of all training excerpts inside an image mask.

    The mask image is downloaded over the network, so this needs internet
    access.

    Parameters
    ----------
    mask_url : str
        URL of the image used as the word-cloud mask.
    timeout : float
        Seconds to wait for the download before giving up.

    Raises
    ------
    requests.RequestException
        If the download times out or the server answers with an error status.
    """
    # Distinct local name: ``stopwords`` is also a module imported from nltk.
    cloud_stopwords = set(STOPWORDS)
    comment_words = combine_all_words()

    # A timeout keeps the cell from hanging forever, and the status check
    # stops an error page from being handed to PIL as if it were an image.
    response = requests.get(mask_url, stream=True, timeout=timeout)
    response.raise_for_status()
    pic = np.array(Image.open(response.raw))

    wordcloud = WordCloud(width = 800, height = 800,
                          background_color ='white',
                          stopwords = cloud_stopwords, mask = pic,
                          min_font_size = 10).generate(comment_words)
    plt.figure(figsize = (10, 10), facecolor = 'white', edgecolor='blue')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)

    plt.show()

In [None]:
# create_word_cloud()

# Creating a baseline model using BERT 

In [None]:
# parameters

# Run on the GPU when one is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Optimizer settings (passed to AdamW in main()).
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 1e-2
BETAS = (0.9, 0.999)
# Every excerpt is padded or truncated to this many tokens by the tokenizer.
MAX_LENGTH = 314
BATCH_SIZE = 16
NUM_EPOCHS = 20
# Worker processes for the training DataLoader.
NUM_WORKERS = 6
# Checkpoint to restore; only read when LOAD_MODEL is True.
CHECKPOINT_FILE = 'model.roberta-base.epoch.3.lr.3e-06.wd.0.01.freeze.False.rmse.0.53753'
PIN_MEMORY = True
# Save a checkpoint whenever the validation RMSE improves.
SAVE_MODEL = True
LOAD_MODEL = False
# Hugging Face model name used for both the tokenizer and the encoder.
PRETRAINED_MODEL = 'roberta-base'
# When True, the pretrained encoder weights are excluded from training.
FREEZE = False
tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_MODEL)
#tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)
# tokenizer = transformers.BertTokenizer.from_pretrained("../input/huggingface-bert/bert-base-uncased")

### Split dataset into train and validation 

In [None]:
# train/val
# Hold out 20% of the labelled excerpts for validation; the fixed
# random_state makes the split repeatable.
train_text, val_text, train_label, val_label = train_test_split(train['excerpt'], 
                                                                train['target'], 
                                                                random_state=2018, 
                                                                test_size=0.2
                                                               )
# test
# The test set has no labels here; only the excerpts are used.
test_text = test['excerpt']

In [None]:
print('train shape: ', train_text.shape)
print('val shape: ', val_text.shape)
print('test shape: ', test_text.shape)

### Creating dataset

In [None]:
# creating our dataset
class DRDataset(Dataset):
    """Dataset that tokenizes one excerpt per item with the module-level ``tokenizer``.

    Parameters
    ----------
    text_col : pandas.Series
        Excerpts, accessed positionally.
    label_col : pandas.Series
        Targets aligned with ``text_col``; not read when ``train`` is False.
    train : bool
        True when labels are available; False makes every item carry the
        placeholder label -1.
    """

    def __init__(self, text_col, label_col, train=True):
        super().__init__()
        self.text_col = text_col
        self.label_col = label_col
        self.train = train

    def __len__(self):
        """Number of excerpts."""
        return self.text_col.shape[0]

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` tensors for one excerpt."""
        text = self.text_col.iloc[index]
        # Unlabelled data gets a constant -1 so that batches keep one shape.
        label = self.label_col.iloc[index] if self.train else -1

        # Pad/truncate to MAX_LENGTH so that items can be stacked into a batch.
        encoded = tokenizer.encode_plus(
            text,
            padding='max_length',
            max_length=MAX_LENGTH,
            truncation=True,
            return_token_type_ids=False
        )

        return (
            torch.tensor(encoded['input_ids']),
            torch.tensor(encoded['attention_mask']),
            torch.tensor(label),
        )
    

# """
# Test if everything works ok
# """
# dataset = DRDataset(
#     text_col=train_text,
#     label_col=train_label,
#     train=True,
# )
# loader = DataLoader(
#     dataset=dataset, batch_size=32, num_workers=2, shuffle=True, pin_memory=True
# )

# for seq, mask, y in tqdm(loader):
#     print(seq.shape[0])
#     print(mask.shape[0])
#     print(y.shape[0])
#     import sys
#     sys.exit()


In [None]:
def loading_data():
    """Build the DataLoaders for the train, validation and test splits.

    Reads the module-level split variables (``train_text``, ``train_label``,
    ``val_text``, ``val_label``, ``test_text``) and loader settings.

    Returns
    -------
    list
        ``[train_loader, val_loader, test_loader]``.
    """
    train_ds = DRDataset(
            text_col=train_text,
            label_col=train_label,
            train=True,
    )
    val_ds = DRDataset(
            text_col=val_text,
            label_col=val_label,
            train=True,
    )
    # No labels for the test split; the dataset substitutes -1 for each item.
    test_ds = DRDataset(
            text_col=test_text,
            label_col=-1,
            train=False,
    )
    train_loader = DataLoader(
        train_ds,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=True,
    )
    # Evaluation does not benefit from shuffling; a fixed order keeps
    # validation runs repeatable.
    val_loader = DataLoader(
        val_ds,
        batch_size=BATCH_SIZE,
        num_workers=2,
        pin_memory=PIN_MEMORY,
        shuffle=False,
    )
    # Order must be preserved so predictions line up with the submission rows.
    test_loader = DataLoader(
        test_ds, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False
    )

    return [train_loader,
            val_loader,
            test_loader
           ]

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

def make_prediction(model, loader, output_csv="submission.csv"):
    """Predict on every batch of ``loader`` and write a submission file.

    The predictions are stored in the ``target`` column of the module-level
    ``sample`` frame, which is then written to ``output_csv``.

    Parameters
    ----------
    model : torch.nn.Module
        Model called as ``model(seq, mask)``; returns one value per row.
    loader : iterable
        Yields ``(seq, mask, label)`` batches; the label is ignored.
    output_csv : str
        Path of the CSV file to write.
    """
    preds = []
    model.eval()

    with torch.no_grad():
        for batch in tqdm(loader):
            seq, mask, _ = [b.to(device=DEVICE) for b in batch]
            pred = model(seq, mask)
            preds.extend(pred.squeeze(1).cpu().numpy())

    sample['target'] = preds
    print(sample.head())
    # Honour the requested path; it used to be ignored in favour of a
    # hard-coded "submission.csv".
    sample.to_csv(output_csv, index=False)

    # Hand the model back in training mode, as callers expect.
    model.train()
    print("Done with predictions")
    
def check_metric(loader, model, data_type, device="cuda"):
    """Evaluate ``model`` on ``loader`` and report RMSE and R².

    Parameters
    ----------
    loader : iterable
        Yields ``(seq, mask, y)`` batches.
    model : torch.nn.Module
        Model called as ``model(seq, mask)``; returns one value per row.
    data_type : str
        Label used in the printed report (e.g. ``'val'`` or ``'train'``).
    device : str or torch.device
        Device the batches are moved to before the forward pass.

    Returns
    -------
    list
        ``[r2, rmse]``, both rounded to 5 decimals.
    """
    model.eval()

    pred_list = []
    actual_list = []

    with torch.no_grad():
        for batch in tqdm(loader):
            # Use the caller's device; the argument used to be ignored in
            # favour of the global DEVICE.
            seq, mask, y = [b.to(device=device) for b in batch]
            scores = model(seq, mask)

            pred_list.extend(scores.squeeze(1).cpu().numpy())
            actual_list.extend(y.cpu().numpy())

    rmse = round(mean_squared_error(actual_list, pred_list)**0.5, 5)
    r2 = round(r2_score(actual_list, pred_list), 5)

    print(
        f"On {data_type} Got  rmse {rmse}"
    )

    print(
        f"On {data_type} Got r2 {r2}"
    )

    # Hand the model back in training mode for the next epoch.
    model.train()
    return [r2, rmse]

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save, then serialise ``state`` to ``filename`` with ``torch.save``."""
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer, lr):
    """Restore model and optimizer state from ``checkpoint``, then force the learning rate.

    Parameters
    ----------
    checkpoint : dict
        Mapping with the keys ``"state_dict"`` and ``"optimizer"``.
    model : torch.nn.Module
        Receives ``checkpoint["state_dict"]``.
    optimizer : torch.optim.Optimizer
        Receives ``checkpoint["optimizer"]``.
    lr : float
        Learning rate written into every parameter group afterwards.
    """
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

    # The restored optimizer state carries the learning rate that was saved
    # with it; overwrite it so the run uses the rate requested now.
    for group in optimizer.param_groups:
        group.update(lr=lr)

In [None]:
# train
def train_one_epoch(loader, model, optimizer, loss_fn, device):
    """Run one optimisation pass over ``loader``.

    The per-batch loss is the square root of ``loss_fn`` (RMSE when
    ``loss_fn`` is the MSE). The mean batch loss is printed at the end.

    Parameters
    ----------
    loader : iterable
        Yields ``(seq, mask, y)`` batches.
    model : torch.nn.Module
        Model called as ``model(seq, mask)``; returns one value per row.
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model parameters.
    loss_fn : callable
        Loss applied to ``(predictions, targets)``.
    device : str or torch.device
        Device the batches are moved to.
    """
    # Put every sub-module in training mode explicitly. An encoder obtained
    # from ``from_pretrained`` starts in eval mode, so without this the first
    # epoch ran with the encoder's dropout disabled while later epochs (after
    # check_metric switched the model back to train mode) ran with it enabled.
    model.train()

    losses = []
    loop = tqdm(loader)

    for batch in loop:
        seq, mask, y = [b.to(device, non_blocking=True) for b in batch]

        # forward
        scores = model(seq, mask)
        loss = (loss_fn(scores.squeeze(1).float(), y.float())**0.5)

        losses.append(loss.item())

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loop.set_postfix(loss=loss.item())

    # An empty loader reports NaN instead of raising ZeroDivisionError.
    mean_loss = sum(losses)/len(losses) if losses else float('nan')
    print(f"Loss average over epoch: {mean_loss}")


In [None]:
class Roberta_Arch(nn.Module):
    """Regression head stacked on a pretrained encoder.

    The encoder is called with ``return_dict=False`` and must return a pair;
    the second element (a 768-wide vector per row) goes through
    Linear(768, 512) -> ReLU -> Dropout(0.1) -> Linear(512, 1).

    Parameters
    ----------
    bert : torch.nn.Module
        Encoder called as ``bert(ids, attention_mask=mask, return_dict=False)``.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 1)

    def forward(self, sent_id, mask):
        """Return one score per row, shape ``(batch, 1)``."""
        # Only the second encoder output is used; the first is discarded.
        _, pooled = self.bert(sent_id, attention_mask=mask, return_dict=False)
        hidden = self.dropout(self.relu(self.fc1(pooled)))
        return self.fc2(hidden)

In [None]:
def main(test_mode=False):
    """Train the RoBERTa regressor, or predict on the test set.

    Parameters
    ----------
    test_mode : bool
        When True, skip training: write test predictions with the current
        weights (restored from CHECKPOINT_FILE if LOAD_MODEL is set) and
        return. When False, train for NUM_EPOCHS epochs, checkpointing on
        every improvement of the validation RMSE when SAVE_MODEL is set.
    """
    train_loader, val_loader, test_loader = loading_data()
    loss_fn = nn.MSELoss()

    # Load the pretrained encoder; its weights are frozen only when FREEZE is set.
    roberta = RobertaModel.from_pretrained(PRETRAINED_MODEL)
    if FREEZE:
        for param in roberta.parameters():
            param.requires_grad = False
        print('pretrained layers frozen')

    model = Roberta_Arch(roberta)
    model = model.to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, betas=BETAS, weight_decay=WEIGHT_DECAY)

    if LOAD_MODEL:
        # map_location lets a checkpoint saved on a GPU be restored on a
        # CPU-only machine (and the other way round).
        load_checkpoint(torch.load(CHECKPOINT_FILE, map_location=DEVICE), model, optimizer, LEARNING_RATE)
        print(f"model {CHECKPOINT_FILE} loaded successfully")
        # check metric on val
        check_metric(val_loader, model, 'val', DEVICE)

    if test_mode:
        # NOTE(review): with LOAD_MODEL False this predicts with an untrained
        # regression head — confirm that is never the intended combination.
        make_prediction(model, test_loader)
        return

    # Lowest RMSE seen so far on each split, with the epoch that reached it.
    best_val = {'epoch': float('inf'), 'metric': float('inf')}
    best_train = {'epoch': float('inf'), 'metric': float('inf')}

    for epoch in range(NUM_EPOCHS):
        train_one_epoch(train_loader, model, optimizer, loss_fn, DEVICE)

        # get on validation (check_metric returns [r2, rmse])
        metric_val = check_metric(val_loader, model, 'val', DEVICE)[1]
        if metric_val < best_val['metric']:
            best_val['metric'] = metric_val
            best_val['epoch'] = epoch

            if SAVE_MODEL:
                checkpoint = {
                    "state_dict": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                }
                # One file per configuration: each improvement overwrites the
                # previous best checkpoint.
                save_checkpoint(checkpoint, filename=f"model.{PRETRAINED_MODEL}.lr.{LEARNING_RATE}.wd.{WEIGHT_DECAY}.freeze.{FREEZE}")

        # get on train
        metric_train = check_metric(train_loader, model, 'train', DEVICE)[1]
        if metric_train < best_train['metric']:
            best_train['metric'] = metric_train
            best_train['epoch'] = epoch

    print(f"Min rmse on validation: {round(best_val['metric'], 5)} attained in epoch {best_val['epoch']}")
    print(f"Min rmse on training: {round(best_train['metric'], 5)} attained in epoch {best_train['epoch']}")

    
    
main(test_mode=False)

In [None]:
'''
Pretrained model weights frozen
using AdamW(model.parameters(), lr = 1e-3)
0.6951 rmse after 10 epochs
0.658 rmse after 10 epochs
0.6403 rmse and r2:  0.5867 after 50 epochs

using AdamW(model.parameters(), LR=2e-5, betas=(0.9, 0.999), weight_decay=1e-2) 
after 20 epochs 0.6973 rmse and 0.5099 r2

Pretrained model weights not frozen
after 11 epoch: 0.5496
got 0.549 public score


pretrained model frozen for 5 epochs
0.75131 attained in epoch 4
after unfreezing 0.54796 after 18 epochs

with lr 3*10-3 and 3*10-4 very unstable

with lr 3*10-6 got rmse 0.57861 after 3 epochs

using roberta got val rmse 0.51861 after 7 epochs


'''