# **CommonLit Readability**

### **Import libraries**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
import torch
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils import data
from torch import nn, optim
from collections import defaultdict
import warnings
%matplotlib inline

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Do not truncate long cell contents (the excerpts) when displaying DataFrames.
pd.set_option("display.max_colwidth", None)

In [None]:
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

### **Load data**

#### Load and describe the training data

In [None]:
# Read the training set and discard the two licensing columns, which are not used below.
filename = "../input/commonlitreadabilityprize/train.csv"
df_train = pd.read_csv(filename)
df_train = df_train.drop(["url_legal", "license"], axis = 1)

In [None]:
# Preview the first training row.
df_train.head(1)

In [None]:
# Column names, dtypes and non-null counts of the training data.
df_train.info()

In [None]:
# Summary statistics of the regression target.
df_train['target'].describe()

In [None]:
# Keep the target's standard deviation and mean; they are reused later to draw
# the mean +/- one-std band on the scatter plot.
std = df_train['target'].std()
mean = df_train['target'].mean()
print('mean:', mean)
print('std: ', std)

#### Load the test data

In [None]:
# Read the test set and discard the same two licensing columns as for the training set.
filename = "../input/commonlitreadabilityprize/test.csv"
df_test = pd.read_csv(filename)
df_test = df_test.drop(["url_legal", "license"], axis = 1)

In [None]:
# Preview the first test row.
df_test.head(1)

### **Transform data**

#### Remove new lines

In [None]:
def to_string(row_text):
  """Flatten a multi-line text into a single line.

  The text is split on newline characters and every resulting segment,
  including the first one, is prefixed with a single space. The returned
  string therefore always starts with a space.
  """
  segments = row_text.split('\n')
  return "".join(" " + segment for segment in segments)

#### Remove new lines from the training data

In [None]:
# Replace the line breaks inside every training excerpt with spaces.
df_train['excerpt'] = df_train['excerpt'].apply(to_string)

In [None]:
# Check the flattened excerpt of the first training row.
df_train.head(1)

#### Remove new lines from the test data

In [None]:
# Replace the line breaks inside every test excerpt with spaces.
df_test['excerpt'] = df_test['excerpt'].apply(to_string)

In [None]:
# Check the flattened excerpt of the first test row.
df_test.head(1)

### **Exploratory Data Analysis**

In [None]:
# Plot styling shared by all figures below: seaborn dark grid, 9x6 inch figures.
sns.set_style("darkgrid")
rcParams['figure.figsize'] = 9, 6

In [None]:
# Density estimate of the target values.
sns.kdeplot(df_train.target, shade=True, color="r")
plt.xlabel('Average ratings')
plt.show()

In [None]:
# Density estimate of the standard errors.
sns.kdeplot(df_train.standard_error, shade=True, color="r")
plt.xlabel('Standard errors')
plt.show()

In [None]:
# Scatter the standard error against the target. The orange arrow labelled
# "remove" points at the data coordinates (0, 0).
x=df_train['target']
y=df_train['standard_error']
plt.scatter(x=x, y=y)
plt.annotate("remove", xy=(0, 0), arrowprops=dict(facecolor='orange', shrink=0.05), 
             xytext=(0.6, 0.3), textcoords='axes fraction', fontsize=12, weight='bold',
             horizontalalignment='right', verticalalignment='top', color='orange')
plt.xlabel('Targets')
plt.ylabel('Standard errors')
plt.show()

In [None]:
# Drop every training row whose target is exactly 0.
ind = df_train[df_train['target'] == 0].index
df_train = df_train.drop(ind)

In [None]:
# One standard deviation either side of the mean. Note that `mean` and `std`
# were computed before the zero-target rows were dropped.
lower_bound = mean - std
upper_bound = mean + std
lower_bound, upper_bound

In [None]:
# Same scatter plot after the removal, with dashed vertical lines at mean - std
# and mean + std.
plt.scatter(x=df_train['target'], y=df_train['standard_error'])

plt.axvline(x=lower_bound, ymin=0, ymax=1, linewidth=1.5, linestyle="--", color='darkorchid')
plt.axvline(x=upper_bound, ymin=0, ymax=1, linewidth=1.5, linestyle="--", color='darkorchid')

plt.xlabel('Targets')
plt.ylabel('Standard errors')
plt.show()

In [None]:
# Range of the remaining target values.
min_value = df_train["target"].min()
max_value = df_train["target"].max()
print("min: ",  min_value)
print("max: ",  max_value)

### **Choose sequence length**

In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the BERT encoder.
PRE_TRAINED_MODEL = "bert-base-uncased"

In [None]:
# Tokenizer matching the pretrained checkpoint named above.
tokenizer = transformers.BertTokenizer.from_pretrained(PRE_TRAINED_MODEL)

In [None]:
%%time

# For each dataset, print the smallest and largest excerpt token count and
# plot the distribution of token counts.
for label, frame in (("training data", df_train), ("test data", df_test)):
  token_counts = [len(tokenizer.tokenize(text)) for text in frame.excerpt]

  fewest = min(token_counts)
  most = max(token_counts)
  print(label,":")
  print("-" * 100)
  print('min ve max tokens:', fewest, most)
  print('\n')

  sns.distplot(token_counts)
  # Pad the x-axis by 50 tokens on both sides of the observed range.
  plt.xlim([fewest-50, most+50])
  plt.xlabel('Token count')
  plt.show()

  print('\n')


### **Set parameters**

In [None]:
BS = 4  # batch size of the training and validation data loaders
MAX_LEN = 320  # token length every excerpt is padded/truncated to
EPOCHS = 5  # number of training epochs
RANDOM_SEED = 42  # random_state of the train/validation split
BIAS = False  # forwarded to AdamW as correct_bias
SPLIT_RATIO = 0.1  # share of the training data held out for validation
DROPOUT = 0.3  # dropout probability in front of the regression layer

WD = 0  # AdamW weight decay
LEARNING_RATE = 2e-5  # AdamW learning rate
NUM_WARMUP_PERCENTAGE = 0.1  # fraction (not percent) of scheduled steps used for warmup

In [None]:
def define_scheduler(data_loader):
  """Create a linear learning-rate schedule with warmup for the global optimizer.

  The schedule covers ``len(data_loader) * EPOCHS`` steps, and the first
  ``NUM_WARMUP_PERCENTAGE`` share of them (truncated to an int) are warmup
  steps. Reads the module-level ``optimizer``, ``EPOCHS`` and
  ``NUM_WARMUP_PERCENTAGE``.
  """
  steps_per_epoch = len(data_loader)
  total_steps = steps_per_epoch * EPOCHS
  warmup_steps = int(NUM_WARMUP_PERCENTAGE*total_steps)

  return get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=warmup_steps,
      num_training_steps=total_steps,
  )

### **Train-validation split**

In [None]:
def split_train_val(full_data, SPLIT_RATIO):
  """Split ``full_data`` into a training and a validation part.

  ``SPLIT_RATIO`` is passed to ``train_test_split`` as ``test_size`` and the
  module-level ``RANDOM_SEED`` as ``random_state``. The shapes of both parts
  are printed before they are returned as ``(train, validation)``.
  """
  train_part, val_part = train_test_split(
      full_data, test_size=SPLIT_RATIO, random_state=RANDOM_SEED
  )

  for label, part in (("training data:", train_part), ("validation data:", val_part)):
    print(label, part.shape)

  return train_part, val_part

### **Encoding**

#### Encode train excerpts.

In [None]:
class ExcerptDataset(data.Dataset):
  """Map-style dataset that tokenizes one excerpt per item.

  Args:
    ids: indexable collection of excerpt identifiers.
    excerpts: indexable collection of excerpt texts.
    targets: indexable collection of numeric targets.
    tokenizer: object exposing ``encode_plus`` (a BERT tokenizer in this file).
    max_len: length every encoded excerpt is padded or truncated to.
  """

  def __init__(self, ids, excerpts, targets, tokenizer, max_len):
    self.ids = ids
    self.excerpts = excerpts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of excerpts."""
    return len(self.excerpts)

  def __getitem__(self, item):
    """Return id, raw text, token tensors and target of the ``item``-th excerpt."""
    excerpt_id = str(self.ids[item])
    excerpt = str(self.excerpts[item])
    target = self.targets[item]

    # Encode with the tokenizer given to this dataset, not a module-level
    # global, so the constructor argument is actually honoured.
    encoding = self.tokenizer.encode_plus(
        excerpt,
        max_length=self.max_len,
        truncation=True,
        add_special_tokens=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt'
    )

    return {
      'id': excerpt_id,
      'excerpt_text': excerpt,
      # return_tensors='pt' yields a leading batch axis; flatten it away so
      # the DataLoader can stack the items itself.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.float)
    }

In [None]:
# Hold out SPLIT_RATIO of the cleaned training data for validation.
df_train_set, df_val_set = split_train_val(df_train, SPLIT_RATIO)

### **Create data loaders**

In [None]:
def create_data_loader(df, tokenizer, MAX_LEN, batch_size=4, shuffle=True):
  """Wrap a DataFrame in an ``ExcerptDataset`` and return a DataLoader over it.

  The ``id``, ``excerpt`` and ``target`` columns of ``df`` feed the dataset.
  One batch is drawn from the new loader and printed (keys, tensor shapes and
  tensor contents) as a sanity check before the loader is returned.
  """
  loader = data.DataLoader(
    ExcerptDataset(
      ids=df.id.to_numpy(),
      excerpts=df.excerpt.to_numpy(),
      targets=df.target.to_numpy(),
      tokenizer=tokenizer,
      max_len=MAX_LEN,
    ),
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=2,
  )

  # Draw a single batch only to display what the loader produces.
  sample = next(iter(loader))
  tensor_fields = ('input_ids', 'attention_mask', 'targets')

  print(sample.keys())
  print('\n')
  for field in tensor_fields:
    print(sample[field].shape)
  print('\n')
  for field in tensor_fields:
    print(field + ":", sample[field])

  return loader

In [None]:
# Training batches are reshuffled on every pass.
train_data_loader = create_data_loader(df_train_set, tokenizer, MAX_LEN, batch_size=BS, shuffle=True)
# Validation batches keep a fixed order: evaluate() averages per-batch losses,
# and with an RMSE loss that average depends on how rows are grouped into
# batches. Shuffling would make the validation loss vary between epochs for
# reasons unrelated to the model.
val_data_loader = create_data_loader(df_val_set, tokenizer, MAX_LEN, batch_size=BS, shuffle=False)

### **Regression**

#### Regression model

In [None]:
class ExcerptRegression(nn.Module):
  """Pretrained BERT encoder followed by dropout and a one-output linear layer.

  Args:
    DROPOUT: dropout probability applied before the linear layer.
  """

  def __init__(self, DROPOUT):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL)
    self.drop = nn.Dropout(p=DROPOUT)
    self.linear = nn.Linear(self.bert.config.hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Return one regression value per input sequence."""
    # With return_dict=False the encoder returns a tuple; only its second
    # element is fed to the regression head.
    _first_output, second_output = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=False,
    )
    return self.linear(self.drop(second_output))

In [None]:
# Build the regression model and move it to the selected device.
reg_model = ExcerptRegression(DROPOUT)
reg_model = reg_model.to(device)

#### Define loss function RMSE

In [None]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(pred, y) + eps)``.

    Args:
        eps: small constant added under the square root. The derivative of
            ``sqrt`` is infinite at 0, so without it a batch that is predicted
            exactly yields NaN gradients. Pass ``eps=0.0`` for the plain RMSE.
    """

    def __init__(self, eps=1e-8):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, pred, y):
        """Return the scalar RMSE between ``pred`` and ``y``."""
        return torch.sqrt(self.mse(pred, y) + self.eps)

#### Optimizer

In [None]:
# AdamW over all model parameters, configured from the constants set above.
optimizer = AdamW(reg_model.parameters(), lr=LEARNING_RATE, correct_bias=BIAS, weight_decay=WD)

# Training and validation criterion.
loss_fn = RMSELoss().to(device)

### **Train the model**

In [None]:
def train(reg_model, data_loader, loss_fn, optimizer, device, scheduler=None):
  """Run one training epoch and return the mean of the per-batch losses.

  Args:
    reg_model: model called with ``input_ids`` and ``attention_mask``.
    data_loader: iterable of batches with the keys ``input_ids``,
      ``attention_mask`` and ``targets``.
    loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: optional learning-rate scheduler stepped once per batch. When
      omitted, the scheduler from ``define_scheduler`` is created on the first
      call and reused on later calls with the same ``optimizer``.

  Returns:
    The mean of the batch losses of this epoch.
  """
  if scheduler is None:
    # define_scheduler() plans len(data_loader) * EPOCHS steps, so one
    # schedule has to span all epochs. Building a new one on every call
    # restarted the warmup each epoch and the decay never completed, so the
    # schedule is remembered on the optimizer it drives.
    scheduler = getattr(optimizer, "_train_scheduler", None)
    if scheduler is None:
      scheduler = define_scheduler(data_loader)
      optimizer._train_scheduler = scheduler

  reg_model = reg_model.train()
  losses = []

  for batch in data_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    targets = batch["targets"].to(device)

    outputs = reg_model(
      input_ids=input_ids,
      attention_mask=attention_mask,
    )

    # Flatten both sides so predictions and targets have the same 1-D shape.
    loss = loss_fn(outputs.view(-1), targets.view(-1))
    losses.append(loss.item())

    loss.backward()
    # Cap the gradient norm at 1.0 before the update.
    nn.utils.clip_grad_norm_(reg_model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return np.mean(losses)

### **Evaluation**

In [None]:
def evaluate(reg_model, data_loader, loss_fn, device):
  """Return the mean per-batch loss of ``reg_model`` over ``data_loader``.

  The model is switched to eval mode and no gradients are tracked. Each batch
  must provide ``input_ids``, ``attention_mask`` and ``targets``.
  """
  reg_model = reg_model.eval()
  batch_losses = []

  with torch.no_grad():
    for batch in data_loader:
      token_ids = batch["input_ids"].to(device)
      mask = batch["attention_mask"].to(device)
      expected = batch["targets"].to(device)

      predicted = reg_model(input_ids=token_ids, attention_mask=mask)

      # Compare flattened predictions with flattened targets.
      batch_loss = loss_fn(predicted.view(-1), expected.view(-1))
      batch_losses.append(batch_loss.item())

  return np.mean(batch_losses)

### **Run the model**

In [None]:
%%time

# Loss history, one entry per epoch under 'train_loss' / 'validation_loss'.
epoch_results = defaultdict(list)
# Start from +inf instead of an arbitrary large number so the first epoch
# always writes a checkpoint, whatever the scale of the loss.
min_loss = float("inf")
# File that receives the weights of the best epoch so far.
name = "1set_best_model_1.pt"

print("TRAINING RESULTS:")
print('*' * 50)
print('\n')

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 50)

  train_loss = train(reg_model, train_data_loader, loss_fn, optimizer, device)

  print(f'Training   loss: {train_loss}')
  print('\n')

  val_loss = evaluate(reg_model, val_data_loader, loss_fn, device)

  print(f'Validation loss: {val_loss}')
  print('\n')

  epoch_results['train_loss'].append(train_loss)
  epoch_results['validation_loss'].append(val_loss)

  # Checkpoint whenever the validation loss improves on the best seen so far.
  if val_loss < min_loss:
    torch.save(reg_model.state_dict(), './' + name)
    min_loss = val_loss

### **Plot the results**

In [None]:
# Per-epoch loss histories collected by the training loop.
train_loss = epoch_results['train_loss']
validation_loss = epoch_results['validation_loss']

min_train_loss= min(train_loss)
min_val_lost = min(validation_loss)

# Zero-based position of the epoch with the lowest validation loss.
val_index = validation_loss.index(min_val_lost)

# Print the run configuration next to the best losses.
print("model: ", PRE_TRAINED_MODEL)
print("batch size:", BS)
print("maximum sequence length:", MAX_LEN)
print("number of epochs:", EPOCHS)
print("random seed:", RANDOM_SEED)
print("learning rate:", LEARNING_RATE)
print("weight decay:", WD)
print("warmup percentage:", NUM_WARMUP_PERCENTAGE)
print("bias correction:", BIAS)
print("dropout:", DROPOUT)
print("split ratio:", SPLIT_RATIO)
print('\n')
print("minimum train loss:", min_train_loss)
print("minimum validation loss:", min_val_lost)

In [None]:
# Plot both loss curves against the 1-based epoch number.
x = range(1, EPOCHS+1)
y1 = train_loss
y2 = validation_loss

plt.plot(x, y1, label='train loss')
plt.plot(x, y2, label='validation loss')

plt.title('Loss functions for training and validation data', fontsize=15)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.legend()
# Dashed cross-hair through the epoch with the lowest validation loss.
plt.axvline(x=x[val_index], ymin=0, ymax=1, linewidth=1.5, linestyle="--", color='darkorchid')
plt.axhline(y=min_val_lost, linewidth=1.5, linestyle="--", color='darkorchid')

plt.show()

### **Predict the test data**

#### Load the best model

In [None]:
# Rebuild the inference model with a dropout probability of 0.
DROPOUT = 0

In [None]:
# Rebuild the architecture and restore the weights of the best epoch, which
# the training loop saved to ./1set_best_model_1.pt. Without the restore, the
# regression layer would be freshly initialised and the fine-tuning discarded.
reg_model = ExcerptRegression(DROPOUT)
reg_model.load_state_dict(torch.load('./1set_best_model_1.pt', map_location=device))
reg_model = reg_model.to(device)

#### Create data loader of the test data

In [None]:
# ExcerptDataset always reads a target, so give the unlabeled test rows a
# placeholder value; find_predictions never reads it.
df_test['target'] = 100

# shuffle=False keeps the test rows in their original order.
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, batch_size=4, shuffle=False)

#### Predict

In [None]:
def find_predictions(reg_model, data_loader):
  """Run ``reg_model`` over ``data_loader`` and collect its outputs.

  The model is put in eval mode and gradients are disabled. Tensors are moved
  to the module-level ``device``. Returns three parallel lists: the ids, the
  excerpt texts and the flattened model outputs of every batch, in loader
  order.
  """
  collected_ids, collected_excerpts, collected_outputs = [], [], []

  reg_model = reg_model.eval()

  with torch.no_grad():
    for batch in data_loader:
      batch_ids = batch['id']
      batch_excerpts = batch['excerpt_text']

      batch_outputs = reg_model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
      )

      # One value per excerpt: flatten the output tensor to a plain list.
      collected_outputs.extend(batch_outputs.flatten().tolist())
      collected_excerpts.extend(batch_excerpts)
      collected_ids.extend(batch_ids)

  return collected_ids, collected_excerpts, collected_outputs

In [None]:
# Predict a target for every test excerpt.
id_data, excerpt_data, output_data = find_predictions(reg_model, test_data_loader)

In [None]:
# Put ids, excerpts and predicted targets side by side for inspection.
predictions_df = pd.DataFrame(list(zip(id_data, excerpt_data, output_data)), columns =['id', 'excerpt', 'target'])
predictions_df

In [None]:
# Keep only the id and target columns for the submission file.
predictions_df = predictions_df.drop("excerpt", axis = 1)
predictions_df

#### **Save the results**

In [None]:
# Write the submission without the index column, with floats at six decimal places.
predictions_df.to_csv('./submission.csv', index=False, float_format='%.6f')