## Video Tutorial

This EDA is accompanied by a video tutorial; check it out [here](https://www.youtube.com/watch?v=iiwEW-sg9KE&list=PL_49VD9KwQ_OJCqZOeOlSUQKcr1MyifOc&index=2).

In [None]:
# Online variant of the install, left commented out (presumably kept for
# reference for environments that can reach PyPI):
# !pip install datasets
# !pip uninstall fsspec -y
# !pip install fsspec==2021.5.0

# Active variant: install `datasets` and its companions from package files
# stored under /kaggle/input/huggingfaces/datasets instead of an index.
!pip install \
    /kaggle/input/huggingfaces/datasets/datasets* \
    /kaggle/input/huggingfaces/datasets/huggingface_hub* \
    /kaggle/input/huggingfaces/datasets/tqdm* \
    /kaggle/input/huggingfaces/datasets/xxhash*
# Swap the currently installed fsspec for the build shipped in the same folder
# (presumably 2021.5.0, matching the commented-out pin above — TODO confirm).
!pip uninstall fsspec -y
!pip install /kaggle/input/huggingfaces/datasets/fsspec*

In [None]:
# Export WANDB_DISABLED=true for this kernel; transformers checks this variable
# to switch off its Weights & Biases logging integration.
%env WANDB_DISABLED=true

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import torch
from torch import nn

# Apply seaborn's default plot theme and render matplotlib figures inline.
sns.set()
%matplotlib inline

In [None]:
# Locations of the competition CSV files.
data_dir = '/kaggle/input/commonlitreadabilityprize'
train_data_path = os.path.join(data_dir, 'train.csv')
test_data_path = os.path.join(data_dir, 'test.csv')

# Read both files into pandas frames (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

# Row count of each frame, one per line.
print(len(train_df), len(test_df), sep='\n')

In [None]:
# Directory holding the pretrained `bert-base-cased` files (presumably an
# attached Kaggle dataset); loading from a local path avoids any download.
huggingface_dir = '/kaggle/input/huggingface-bert'
model_dir = os.path.join(huggingface_dir, 'bert-base-cased')

# Load the tokenizer and the sequence-classification model from that directory.
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

In [None]:
# Regression setup: the model must emit one continuous value per excerpt.
# Size the new head from the model config rather than hard-coding 768, so the
# cell keeps working if a checkpoint with a different hidden size is loaded.
model.classifier = nn.Linear(model.config.hidden_size, 1)
# `model.num_labels` is what the forward pass consults when choosing the loss;
# mirror the value in the config as well so a saved checkpoint reloads with a
# matching single-output head instead of the default label count.
model.num_labels = 1
model.config.num_labels = 1

In [None]:
#inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#labels = torch.tensor([0]).unsqueeze(0)  # Batch size 1
#inputs['labels'] = labels
#outputs = model(**inputs)
#loss = outputs.loss
#logits = outputs.logits

#print(outputs)
#print(loss)
#print(logits)

In [None]:
# Load the same CSV files again, this time as Hugging Face `datasets` objects.
# Each result is indexed by split name; later cells read the rows back through
# the 'train' key for both the training file and the test file.
train_datasets = load_dataset('csv', data_files=[train_data_path])
test_datasets = load_dataset('csv', data_files=[test_data_path])

In [None]:
# Display a summary of the loaded training dataset.
train_datasets

In [None]:
# Show the tokenizer's table of maximum model input sizes — presumably the
# basis for max_length=512 in the tokenization cell; confirm against the output.
tokenizer.max_model_input_sizes

In [None]:
def tokenize_function(examples):
    """Tokenize the ``excerpt`` entry of ``examples`` with the notebook tokenizer.

    Each sequence is truncated to at most 512 tokens and padded up to exactly
    512, so every row comes out with the same length.

    Args:
        examples: mapping with an ``'excerpt'`` entry (a batch of rows when
            invoked through ``map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those excerpts.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(examples['excerpt'], **tokenizer_options)
# Training data pipeline, in order:
#   1. tokenize every excerpt,
#   2. drop the columns the model does not consume,
#   3. rename the regression target to `labels`,
#   4. shuffle with a fixed seed so the row order is the same on every run.
f_train_datasets = (
    train_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['id', 'url_legal', 'license', 'excerpt', 'standard_error'])
    .rename_column('target', 'labels')
    .shuffle(seed=42)
)

# Test data pipeline: same tokenization; `id` is kept because the submission
# cell reads it back when writing the predictions.
f_test_datasets = (
    test_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['url_legal', 'license', 'excerpt'])
)

In [None]:
# Display the tokenized training dataset to check the resulting columns.
f_train_datasets

In [None]:
# Hold out 10 percent of the (already shuffled) training rows so the model can
# be evaluated on data it was not trained on.
n_samples = len(f_train_datasets['train'])
n_train = int(0.9 * n_samples)

# Training subset: rows [0, n_train).
f_train_dataset = f_train_datasets['train'].select(range(n_train))

# Evaluation subset: the remaining rows [n_train, n_samples).
f_eval_dataset = f_train_datasets['train'].select(range(n_train, n_samples))

# The test CSV was loaded without named splits, so its rows are read back
# through the 'train' key as well.
f_test_dataset = f_test_datasets['train']

In [None]:
# Evaluation metric for the regression head: root mean squared error (RMSE),
#   RMSE = sqrt(mean((real - predicted) ** 2))
# This is an error measure, not an accuracy: lower is better.
def compute_metrics(eval_pred):
    """Compute the RMSE between model outputs and target values.

    Args:
        eval_pred: pair ``(logits, labels)`` of array-likes holding one
            prediction and one target per example; a trailing singleton
            dimension on either (e.g. shape ``(N, 1)``) is accepted.

    Returns:
        ``{'RMSE': value}`` where ``value`` is a plain Python ``float``.
    """
    logits, labels = eval_pred
    # Flatten both to 1-D so an (N, 1) prediction column lines up element-wise
    # with (N,) labels instead of broadcasting to an (N, N) matrix.
    predictions = np.asarray(logits).reshape(-1)
    targets = np.asarray(labels).reshape(-1)
    rmse = np.sqrt(np.mean((targets - predictions) ** 2))
    # Cast to a built-in float: NumPy scalars such as float32 are not
    # JSON-serializable, which matters when metrics are logged or saved.
    return {'RMSE': float(rmse)}

In [None]:
# os.environ['WANDB_API_KEY'] = '<REDACTED>'  # never hard-code API keys: load them from a secret store, and revoke the key that was previously committed on this line

In [None]:
# Fine-tuning hyper-parameters. The first positional argument of
# TrainingArguments is the output directory, spelled out here by keyword.
# NOTE(review): with evaluation_strategy='steps' and no explicit eval_steps,
# evaluation presumably runs every `logging_steps` steps — confirm for the
# installed transformers version.
training_args = TrainingArguments(
    output_dir='training_args',
    learning_rate=1e-4,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='steps',
    logging_steps=10,
)

# Wire the model, the two data subsets and the RMSE metric into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=f_train_dataset,
    eval_dataset=f_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# for x in f_train_dataset:
#     a = len(x['input_ids'])
#     print(a)

# # [len(v) for v in f_train_dataset[0].values()]

In [None]:
# Fine-tune the model with the settings configured on `trainer`.
trainer.train()

In [None]:
# Evaluate on the held-out evaluation subset that was passed to the Trainer.
trainer.evaluate()

In [None]:
# model.save_pretrained('model_v1')

In [None]:
# Run the fine-tuned model over the test rows.
pred_output = trainer.predict(f_test_dataset)

# Pair each test id with its predicted score; squeeze() drops the singleton
# output dimension left by the one-unit regression head.
pred_ids = f_test_dataset['id']
pred_targets = pred_output.predictions.squeeze()
output = pd.DataFrame(dict(id=pred_ids, target=pred_targets))

# Write the submission file without the pandas index column.
output.to_csv(path_or_buf='submission.csv', index=False)