## Intro

In this notebook, we are going to check whether augmenting the data drastically changes the model's interpretation of the texts. As the task of this competition is to predict text readability, we should be very careful about the augmentations we use, as they can easily change the target.

To do so, first we will get the hidden_states of a pretrained RoBERTa model (pretrained on the competition's task with simple RMSE regression) for the provided data by the host. Then, we will do the same and get hidden_states of multiple augmented versions of the provided data. 

In this notebook I'm using **EDA** (**Easy Data Augmentation**) from paper: [EDA: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks](https://arxiv.org/abs/1901.11196)

This augmentation technique uses four methods to augment the data which all are really simple:
1. synonym replacement (SR)
2. random insertion (RI)
3. random swap (RS)
4. random deletion (RD)

Look at the figure below for examples of each method:

![](https://i.ibb.co/k6k5HdM/Screenshot-2021-06-15-235611.png)

As you might be thinking now, I first thought that EDA would harm the readability and hence change the labels drastically; but I continued with the experiment and found interesting results. Stay tuned :)

## Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import copy
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns


import torch
import torch.nn as nn
import torch.nn.functional as F


from sklearn.model_selection import (train_test_split, 
                                     KFold, 
                                     StratifiedKFold, 
                                     StratifiedShuffleSplit)

from sklearn.metrics import mean_squared_error, accuracy_score

import transformers
from transformers import get_linear_schedule_with_warmup

print(transformers.__version__)
np.random.seed(42)

In [None]:
# Copy the RoBERTa tokenizer and model files into the working directory,
# which is where the `from_pretrained('.')` calls in CFG look for them.
!cp ../input/huggingfacemodelsnew/roberta_base_tokenizer/* .
!cp ../input/huggingfacemodelsnew/roberta_base_model/* .

## Config

In [None]:
class CFG:
    """Notebook-wide settings, used as default arguments by the helpers below.

    All attributes are evaluated once, when this class body runs; that
    includes loading the tokenizer and the backbone from the current
    directory.
    """
    # Default of make_folds(): stratify the folds on the binned target.
    stratified = True
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Tokenizer and backbone are both read from the working directory ('.').
    tokenizer = transformers.AutoTokenizer.from_pretrained('.')
    bert_model = transformers.AutoModel.from_pretrained('.')
    # Default truncation length passed to the tokenizer by make_loaders().
    max_length = 300
    # Batch size of every DataLoader built by make_loaders().
    batch_size = 32
    dropout = 0.3 # head dropout probability; None disables dropout -> [None, float]
    fc_dim = None # width of the optional hidden layer in the head; None skips it -> [None, int]
    pool = 'CLS' # how token states are pooled into one vector -> ['avg', 'CLS']

## Dataset

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes every excerpt once, at construction.

    Args:
        dataframe: frame with an 'excerpt' column and, unless ``mode`` is
            "test", a 'target' column.
        tokenizer: callable applied to the list of excerpts with
            ``padding=True`` and ``truncation=True``; its result must expose
            ``.items()`` whose values can be indexed per sample.
        mode: "test" disables all target handling; any other value keeps it.
        max_length: forwarded unchanged to the tokenizer.
    """

    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.mode = mode
        self.dataframe = dataframe
        # Targets only exist outside of test mode.
        if self.mode != "test":
            self.targets = dataframe['target'].values
        excerpts = list(dataframe['excerpt'].values)
        self.encodings = tokenizer(excerpts,
                                   padding=True,
                                   truncation=True,
                                   max_length=max_length)

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the tensors of sample ``idx`` keyed like the tokenizer output.

        Outside of test mode the float target is added under 'labels'.
        """
        item = {}
        for key, values in self.encodings.items():
            item[key] = torch.tensor(values[idx])
        if self.mode != "test":
            item['labels'] = torch.tensor(self.targets[idx]).float()
        return item

In [None]:
def make_loaders(dataframe, tokenizer, mode="train", max_length=CFG.max_length):
    """Build a DataLoader over a TextDataset made from ``dataframe``.

    Batches have ``CFG.batch_size`` samples and are shuffled only when
    ``mode`` is "train"; every other mode keeps the dataframe order.
    """
    dataset = TextDataset(dataframe, tokenizer, mode, max_length=max_length)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=CFG.batch_size,
        shuffle=(mode == "train"),
        num_workers=2,
    )

## Model

In [None]:
class CustomModel(nn.Module):
    """Transformer backbone followed by a small regression head.

    Args:
        bert_model: backbone called with ``input_ids`` and ``attention_mask``;
            its output must expose ``last_hidden_state``.
        bert_hidden_dim: size of the backbone's hidden vectors.
        fc_dim: width of an optional hidden linear layer; falsy skips it.
        pool: "CLS" takes the first token's state, "avg" averages all token
            states.
        dropout: dropout probability used in the head; falsy disables it.

    Raises:
        ValueError: if ``pool`` is not one of the supported modes.
    """

    _POOLING_MODES = ("CLS", "avg")

    def __init__(self,
                 bert_model,
                 bert_hidden_dim=768,
                 fc_dim=CFG.fc_dim,
                 pool=CFG.pool,
                 dropout=CFG.dropout):

        super().__init__()
        # Fail at construction instead of with an unbound local in forward().
        if pool not in self._POOLING_MODES:
            raise ValueError(
                f"pool must be one of {self._POOLING_MODES}, got {pool!r}"
            )
        self.pool = pool
        self.bert_model = bert_model
        # Always four positional sub-modules (Identity as placeholder), so the
        # parameter names of the head do not move when options are toggled.
        self.head = nn.Sequential(
            nn.Dropout(dropout) if dropout else nn.Identity(),
            nn.Linear(bert_hidden_dim, fc_dim) if fc_dim else nn.Identity(),
            nn.Dropout(dropout) if fc_dim and dropout else nn.Identity(),
            nn.Linear(fc_dim, 1) if fc_dim else nn.Linear(bert_hidden_dim, 1)
        )

    def forward(self, batch):
        """Run the backbone on ``batch`` and return ``(logits, features)``.

        ``features`` is the pooled hidden state fed to the head and
        ``logits`` is the head's output (one value per sample).
        """
        output = self.bert_model(input_ids=batch['input_ids'],
                                 attention_mask=batch['attention_mask'])

        last_hidden_state = output.last_hidden_state
        if self.pool == "CLS":
            features = last_hidden_state[:, 0, :]
        elif self.pool == "avg":
            # NOTE(review): this mean runs over every position, padded ones
            # included. Kept as is so existing checkpoints behave the same;
            # confirm whether a mask-weighted mean was intended.
            features = last_hidden_state.mean(dim=1)
        else:
            # Only reachable if self.pool was changed after construction.
            raise ValueError(
                f"pool must be one of {self._POOLING_MODES}, got {self.pool!r}"
            )

        logits = self.head(features)
        return logits, features

In [None]:
def load_model(fold, mode="score"):
    """Build a CustomModel and load the checkpoint saved for ``fold``.

    Args:
        fold: fold number used in the checkpoint file name.
        mode: "score" selects the ``best_scoring_model_fold_*`` file, any
            other value selects the ``best_loss_fold_*`` file.

    Returns:
        The model on ``CFG.device``, switched to eval mode.
    """
    # Work on a private copy of the backbone: load_state_dict() writes into
    # the wrapped module, so reusing CFG.bert_model directly would make every
    # model returned by this function share (and overwrite) the same weights.
    backbone = copy.deepcopy(CFG.bert_model)
    model = CustomModel(backbone).to(CFG.device)
    checkpoint_kind = "scoring_model" if mode == 'score' else "loss"
    state_dict = torch.load(
        f'../input/commonlitrobertabase/best_{checkpoint_kind}_fold_{fold}.pt',
        map_location=CFG.device,
    )
    model.load_state_dict(state_dict)
    model.eval()
    return model

## Folds

In [None]:
def make_folds(n_splits=5, stratified=CFG.stratified):
    """Read the training CSV and tag every row with a validation fold id.

    Args:
        n_splits: number of folds.
        stratified: when truthy, the target is cut into
            ``floor(1 + log2(len(df)))`` equal-width bins (stored in
            'target_bin_label') and the folds are stratified on those bins;
            otherwise a plain shuffled KFold is used.

    Returns:
        The dataframe with an added 'fold' column.
    """
    df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
    print(f"Building stratified folds: {stratified}")

    if stratified:
        bins = int(np.floor(1 + np.log2(len(df))))
        df['target_bin_label'] = pd.cut(df['target'].values, bins, labels=range(bins))
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        strata = df['target_bin_label'].astype(int)
    else:
        splitter = KFold(n_splits=n_splits, shuffle=True, random_state=123)
        strata = None

    # Each row is in the validation part of exactly one split; that split's
    # number becomes the row's fold id.
    for fold_id, (_, valid_idx) in enumerate(splitter.split(df, strata)):
        df.loc[valid_idx, 'fold'] = int(fold_id)

    return df

In [None]:
def get_train_valid(df, fold):
    """Split ``df`` on its 'fold' column.

    Returns:
        ``(train_df, valid_df)``: rows whose fold differs from ``fold`` and
        rows whose fold equals it, each re-indexed from 0.
    """
    in_valid = df['fold'] == fold
    valid_df = df[in_valid].reset_index(drop=True)
    train_df = df[~in_valid].reset_index(drop=True)
    return train_df, valid_df

## Getting Hidden Representations of Main Data

Here I'm loading one of my previously trained models on the competition's data with regression loss. In addition to predictions, I'm also returning the hidden_states of last layer for CLS token or avg of tokens (you can change this behaviour in CFG, but as the model is trained using CLS, it is used here as well)

For simplicity, I'm only getting the predictions and hidden_states for fold number 4 ignoring other samples.

In [None]:
# Load the training data and assign a 'fold' id to every row (default arguments).
df = make_folds()

In [None]:
fold = 4

# Score only the validation rows of this fold with the checkpoint of the same fold.
_, valid_df = get_train_valid(df, fold=fold)
valid_loader = make_loaders(valid_df, CFG.tokenizer, "valid")
model = load_model(fold)

# Per-batch outputs are collected first and concatenated once at the end.
pred_chunks, feature_chunks = [], []
with torch.no_grad():
    for batch in tqdm(valid_loader):
        batch = {name: tensor.to(CFG.device) for name, tensor in batch.items()}
        preds, features = model(batch)
        pred_chunks.append(preds)
        feature_chunks.append(features)

fold_preds = torch.cat(pred_chunks).cpu()
fold_features = torch.cat(feature_chunks, dim=0).cpu()
print(fold_preds.shape, fold_features.shape)

In [None]:
# RMSE of the fold predictions against the true targets. The root is taken
# explicitly because the `squared=False` switch is no longer accepted by newer
# scikit-learn releases; arguments follow the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(df[df['fold'] == fold].target,
                           fold_preds.numpy().reshape(-1)))

## Binning the target and predictions

In order to be able to plot the datapoints more easily, I'm binning the target values into 10 bins and turning it into a classification task. I'm also binning the predictions with the same bins returned by pandas

In [None]:
num_bins = 10
# retbins=True also hands back the bin edges, so the predictions can be cut
# with exactly the same edges afterwards.
class_labels, bins = pd.cut(df['target'].values, num_bins,
                            labels=range(num_bins), retbins=True)
df['class_label'] = class_labels
main_labels = df['class_label']
# Positional row id, used as the join key with the augmented data.
df['idx'] = np.arange(len(df))

In [None]:
# Rows outside the evaluated fold hold a real NaN (not the string "nan"), so
# both columns stay numeric and pandas recognises the values as missing.
in_fold = df['fold'] == fold
flat_preds = fold_preds.numpy().reshape(-1)  # one prediction per row, 1-D

df['preds'] = np.nan
df['preds_class_label'] = np.nan
df.loc[in_fold, 'preds'] = flat_preds
# labels=False returns the integer index of the bin, i.e. the same ids that
# range(num_bins) assigns; predictions outside the bin edges become NaN.
df.loc[in_fold, 'preds_class_label'] = pd.cut(flat_preds, bins=bins, labels=False)

## Checking the accuracy of model on main data

In [None]:
def accuracy(preds, targets):
    """Return the fraction of positions where ``preds`` equals ``targets``.

    Both arguments must support element-wise ``==`` (NumPy arrays or pandas
    Series).
    """
    is_match = preds == targets
    return np.mean(is_match.astype(float))

# Compare binned predictions with binned targets on the evaluated fold only.
in_fold = df['fold'] == fold
accuracy(df.loc[in_fold, 'preds_class_label'].values,
         df.loc[in_fold, 'class_label'].astype(int).values)

As you see, although the model had an RMSE of 0.5 on this fold, the accuracy is not that high. But that's okay because we need the classification part only for visualization purposes later in this notebook 

## Loading the augmented dataframe

[In my other notebook](https://www.kaggle.com/moeinshariatnia/commonlit-easy-data-augmentation), I augmented the competition's data using the EDA GitHub repo and got 9 augmented versions (using the 4 methods described earlier) of each sample in the main data of the competition; so, the augmented file is 10 (9 + 1) times the size of the previous df. The order of the "augmented" dataframe is as follows:

1. idx, excerpt
2. 0, augmented_version_0
3. 0, augmented_version_1
4. ...
5. 0, augmented_version_8
6. 0, main_version
7. 1, augmented_version_0
8. 1, augmented_version_1
9. ...
10. 1, augmented_version_8
11. 1, main_version

In [None]:
# Tab-separated file without a header row: sample id, then the text.
augmented = pd.read_csv(
    "../input/commonliteasydataaugmentation/eda_data.txt",
    sep="\t",
    names=['idx', 'excerpt'],
    header=None,
)
n_augmented_rows = augmented.shape[0]
# Second number: how many rows the file holds per row of df.
print(augmented.shape, n_augmented_rows // df.shape[0])
augmented.head()

I'll drop the main versions as we have them in df

In [None]:
# Positions 9, 19, 29, ... are dropped, i.e. the last row of every group of 10.
to_drop = list(range(9, len(augmented), 10))
augmented_only = augmented.drop(index=to_drop).reset_index(drop=True)

assigning the same target value and fold for each augmented version of a sample

In [None]:
# Left join on 'idx' keeps every augmented row and copies over the label,
# fold and target of the sample it was generated from.
label_columns = df[['idx', 'class_label', 'fold', 'target']]
augmented_only = augmented_only.merge(label_columns, how='left', on='idx')
augmented_only.head()

## Getting Hidden Representations of Augmented Data

In [None]:
# Same procedure as for the main data, now on the augmented rows of the fold.
_, valid_df = get_train_valid(augmented_only, fold=fold)
valid_loader = make_loaders(valid_df, CFG.tokenizer, "valid")
model = load_model(fold)

# Per-batch outputs are collected first and concatenated once at the end.
pred_chunks, feature_chunks = [], []
with torch.no_grad():
    for batch in tqdm(valid_loader):
        batch = {name: tensor.to(CFG.device) for name, tensor in batch.items()}
        preds, features = model(batch)
        pred_chunks.append(preds)
        feature_chunks.append(features)

augmented_fold_preds = torch.cat(pred_chunks).cpu()
augmented_fold_features = torch.cat(feature_chunks, dim=0).cpu()

In [None]:
# RMSE of the predictions on the augmented texts against the inherited
# targets. The root is taken explicitly because the `squared=False` switch is
# no longer accepted by newer scikit-learn releases; arguments follow the
# documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(augmented_only[augmented_only['fold'] == fold].target,
                           augmented_fold_preds.numpy().reshape(-1)))

Oh, it seems that augmenting the data doesn't preserve the target that well :( --> compare it with the 0.5 RMSE of the main data

it's not that shocking actually because some of the techniques in EDA remove some words and add noise which obviously hurt the texts.

but let's keep on going to visualization part and see if the hidden_states are useless (or not!)

In [None]:
# Rows outside the evaluated fold hold a real NaN (not the string "nan"), so
# both columns stay numeric and pandas recognises the values as missing.
in_fold = augmented_only['fold'] == fold
flat_preds = augmented_fold_preds.numpy().reshape(-1)  # one prediction per row, 1-D

augmented_only['preds'] = np.nan
augmented_only['preds_class_label'] = np.nan
augmented_only.loc[in_fold, 'preds'] = flat_preds
# labels=False returns the integer index of the bin, i.e. the same ids that
# range(num_bins) assigns; predictions outside the bin edges become NaN.
augmented_only.loc[in_fold, 'preds_class_label'] = pd.cut(flat_preds, bins=bins, labels=False)

In [None]:
# Binned predictions vs. binned targets for the augmented rows of the fold.
in_fold = augmented_only['fold'] == fold
accuracy(augmented_only.loc[in_fold, 'preds_class_label'],
         augmented_only.loc[in_fold, 'class_label'].astype(int))

even worse! accuracy is near random guessing (we have 10 bins or class ~ 0.1 chance by randomly guessing)

## The Cool part: magic with tSNE

Okay, thanks if you've stayed with me until this part. Here I got some excitement after the disappointing results before. I don't know how much we can trust the following results but I wanted to do some experiments like the ones in the paper to see if EDA hurts the embeddings of the model.

In [None]:
import cudf, cuml
import cupy as cp

from cuml.manifold import TSNE, UMAP
import matplotlib.pyplot as plt

In [None]:
# Class labels of the evaluated fold, in the same row order as the features.
fold_labels = df.loc[df['fold'] == fold, 'class_label'].values
augmented_fold_labels = augmented_only.loc[augmented_only['fold'] == fold, 'class_label'].values

Here, I will first convert the 768-dimensional output of RoBERTa to 2D in order to be able to visualize it. Then, I'll tag each datapoint with its associated class (0, 1, 2, ..., 9) to recognize it better. As you know, the classes are built by binning, so that the most positive targets get class 9, the most negative targets get class 0, and the others in between get their classes according to their target

0 --> most negative (-3.x) <br>
9 --> most positive (+1.4x and above)

### Main Datapoints

In [None]:
%%time
# Project the fold's features to 2-D for plotting.
tsne = TSNE(n_components=2, perplexity=10, random_state=42)
train_2D = tsne.fit_transform(fold_features.numpy())

markers = ["o", "v", "8", "s", "p", "*", "h", "H", "+", "x", "D"]
plt.rc('legend', fontsize=10)
classes_to_visual = list(set(fold_labels))
C = len(classes_to_visual)
# Repeat the marker list until there is at least one marker per class.
while C > len(markers):
    markers += markers

class_ids = {label: position for position, label in enumerate(classes_to_visual)}

plt.figure(figsize=(10, 10), facecolor='white')

# One plot call per class, so each class gets its own marker and legend entry.
labels_array = np.array(fold_labels)
for class_label in classes_to_visual:
    is_class = labels_array == class_label
    plt.plot(train_2D[is_class, 0], train_2D[is_class, 1],
             linestyle='None', alpha=1,
             marker=markers[class_ids[class_label]],
             markersize=10, label=class_label)
legend = plt.legend(loc='upper right', shadow=True, title="class id")
plt.title("Main Datapoints")
plt.axis("off")
plt.show()

As you see, for the main datapoints it seems that the model is powerful enough to cluster really positive targets far from really negative ones. You can also see horizontal splits for other classes in somewhat a good order

### Augmented Datapoints

Here I do the same for the augmented ones. As I said, we have 9 augmented versions of the main datapoints. With the variable "offset" below, I'm using only one augmented version of each main datapoint and plotting it with tSNE

In [None]:
%%time

offset = 7
# Take every 9th row starting at `offset`: one augmented version per sample.
# NOTE(review): relies on the fold rows being grouped as 9 consecutive
# augmented versions per sample - confirm against the augmented file.
# To plot every augmented version instead, use range(len(augmented_fold_features)).
idxs = list(range(offset, len(augmented_fold_features), 9))

tsne = TSNE(n_components=2, perplexity=10, random_state=42)
train_2D = tsne.fit_transform(augmented_fold_features[idxs, :].numpy())

markers = ["o", "v", "8", "s", "p", "*", "h", "H", "+", "x", "D"]
plt.rc('legend', fontsize=10)
classes_to_visual = list(set(augmented_fold_labels))
C = len(classes_to_visual)
# Repeat the marker list until there is at least one marker per class.
while C > len(markers):
    markers += markers

class_ids = {label: position for position, label in enumerate(classes_to_visual)}

plt.figure(figsize=(10, 10), facecolor='white')

# Labels of the plotted subset, aligned with the rows of train_2D.
subset_labels = np.array(augmented_fold_labels[idxs])
for class_label in classes_to_visual:
    is_class = subset_labels == class_label
    plt.plot(train_2D[is_class, 0], train_2D[is_class, 1],
             linestyle='None', alpha=1,
             marker=markers[class_ids[class_label]],
             markersize=10, label=class_label)
legend = plt.legend(loc='upper right', shadow=True, title="class id")
plt.title("Augmented Datapoints")
plt.axis("off")
plt.show()

Here it is!!! <br>
This was the cool thing that I wanted to tell you about. Although the predictions of the model on the augmented version were far off the real targets (terrible RMSE and accuracy), here we see that the hidden_states of the model seem to be representative enough! <br>
They are able to separate the far classes (0, 1, 2 from 8, 9 for example) from each other and we see a somewhat similar pattern as the main data here.

By the way, as noted [in this awesome blog post on Distill](https://distill.pub/2016/misread-tsne/), interpreting tSNE results is tricky and we should be very careful about what conclusions we draw from them. My main goal in sharing this notebook was to spread the idea of checking whether augmentations hurt or not with these tools.

If you want to know more about tSNE, how to interpret the results, and the perplexity hyperparameter, you definitely want to check that blog post out.

## Checking two specific classes

Inspired by this plot from the paper:

![](https://i.ibb.co/7Qr1r2L/Screenshot-2021-06-16-005757.png)

I wanted to do something similar here: to check if the hidden_states of two specific labels for the main data and augmented data lie near each other.

In [None]:
selected_labels = [0, 8] # I chose some obviously far apart classes
first_label, second_label = selected_labels

# Main data: keep only the rows that belong to one of the two classes.
mask = (fold_labels == first_label) | (fold_labels == second_label)
selected_fold_features = fold_features[mask, :]
selected_fold_labels = fold_labels[mask]
print(selected_fold_features.shape, selected_fold_labels.shape)

# Augmented data: same filter, applied to the one-version-per-sample subset (idxs).
subset_labels = augmented_fold_labels[idxs]
mask = (subset_labels == first_label) | (subset_labels == second_label)
selected_augmented_fold_features = augmented_fold_features[idxs][mask, :]
selected_augmented_fold_labels = subset_labels[mask]
print(selected_augmented_fold_features.shape, selected_augmented_fold_labels.shape)

In [None]:
# One 2-D embedding per selected class of the main data.
# NOTE(review): every fit_transform call is an independent fit, so the
# resulting coordinates presumably do not share a common frame - keep that in
# mind when the embeddings are drawn on the same axes.
perplexity = 2
is_first_class = selected_fold_labels == selected_labels[0]
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
train_class_0 = tsne.fit_transform(selected_fold_features[is_first_class, :].numpy())

perplexity = 5
is_second_class = selected_fold_labels == selected_labels[1]
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
train_class_1 = tsne.fit_transform(selected_fold_features[is_second_class, :].numpy())

In [None]:
# One 2-D embedding per selected class of the augmented data.
# NOTE(review): every fit_transform call is an independent fit, so the
# resulting coordinates presumably do not share a common frame - keep that in
# mind when the embeddings are drawn on the same axes.
perplexity = 2
is_first_class = selected_augmented_fold_labels == selected_labels[0]
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
augmented_class_0 = tsne.fit_transform(selected_augmented_fold_features[is_first_class, :].numpy())

perplexity = 5
is_second_class = selected_augmented_fold_labels == selected_labels[1]
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
augmented_class_1 = tsne.fit_transform(selected_augmented_fold_features[is_second_class, :].numpy())

In [None]:
plt.figure(figsize=(10, 10), facecolor='white')

# (points, marker, legend label) for each of the four embeddings. The legend
# names the class ids that were actually selected instead of a positional
# "class_0"/"class_1", which did not match the plotted classes.
series = [
    (train_class_0, "o", f"main_class_{selected_labels[0]}"),
    (train_class_1, "v", f"main_class_{selected_labels[1]}"),
    (augmented_class_0, "8", f"augmented_class_{selected_labels[0]}"),
    (augmented_class_1, "+", f"augmented_class_{selected_labels[1]}"),
]

for points, marker, label in series:
    plt.plot(points[:, 0], points[:, 1], linestyle='None', alpha=1,
             marker=marker, markersize=10, label=label)

legend = plt.legend(loc='upper right', shadow=True, title="mode/class")
plt.title("Plotting main vs augmented datapoints of two specific classes")
plt.axis("off")
plt.show()

It's very tricky to interpret this result: it seems that the representations of the augmented view are near the main one for individual classes, but I don't encourage you to accept this hypothesis, as the randomness and the perplexity change the plot dramatically and tSNE itself is not a very reliable tool for this purpose. But I think you'll agree that the results seem interesting :)