**Hugging Face Model**

In this notebook I predict the target with a Hugging Face model after cleaning the text. I took help from other notebooks on Kaggle and tried to improve on their solution.
To run the notebook offline (without internet access on Kaggle) you need to attach the following datasets:
1. distillbert-huggingface-model
2. hf-datasets
3. nltk-stopwords

In the script I have performed the following steps:
* EDA and text cleaning
* Custom K-fold
* Setting up Hugging face model
* Prediction

Please comment with your suggestions and anything else that could have been done. Thank you.

Used following script for reference.
"https://www.kaggle.com/thedrcat/commonlit-hf-minimalistic-example"

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import nltk
import regex as re
from nltk.stem import WordNetLemmatizer
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import string
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the English stop-word list from the attached input dataset into a
# NumPy array of strings (used by `remove_pun_stopwords`).
stopwords_path = "../input/stopwords/stopwords/english"
stopwords = np.loadtxt(stopwords_path, dtype=str)

In [None]:
#to install datasets library
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

In [None]:
# Read the competition files from the Kaggle input directory:
# the training set, the test set and the sample submission.
train = pd.read_csv(r'/kaggle/input/commonlitreadabilityprize/train.csv')
test = pd.read_csv(r'/kaggle/input/commonlitreadabilityprize/test.csv')
sb = pd.read_csv(r'/kaggle/input/commonlitreadabilityprize/sample_submission.csv')

**EDA**
* Almost 70 percent of the values in **url_legal and license are null**

In [None]:
# Fraction of missing values in each column of the training set.
train.isna().sum()/train.shape[0]

In [None]:
# Plot the distribution of the target column and give the axes a title.
# NOTE(review): `sns.distplot` is reported as deprecated in recent seaborn
# releases — confirm against the installed version before upgrading.
sns.distplot(train["target"]).set(title='Distribution of Target Variable')


**Cleaning Texts**

In [None]:
# Removing punctuation and stop words from sentences

# nltk.download('stopwords', quiet=True)
# stopwords = nltk.corpus.stopwords.words('english')
def remove_pun_stopwords(text):
    """Strip punctuation and stop words from ``text``.

    Every character that is neither a word character nor whitespace is
    deleted, the text is lower-cased and split on whitespace, and tokens found
    in the module-level ``stopwords`` collection are dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining lower-case tokens joined by single spaces.
    """
    text = re.sub(r'[^\w\s]', '', text)
    # Build the lookup set once per call: testing membership against the raw
    # stop-word array would scan every entry again for every single token.
    # `np.ravel` accepts a list, a 1-D array or a single-word 0-d array.
    stopword_set = frozenset(np.ravel(stopwords).tolist())
    # The text is lower-cased before splitting, so tokens need no second
    # `lower()` call.
    tokens = [token for token in text.lower().split() if token not in stopword_set]
    return ' '.join(tokens)




The next function lower-cases the text and removes text in square brackets, links, HTML tags, punctuation, newlines and words containing numbers.


In [None]:
def clean_text(text):
    """Normalize a raw excerpt before tokenization.

    Steps, in order: lower-case the text; drop anything inside square
    brackets; drop URLs; drop HTML-like tags; drop ASCII punctuation; turn
    newlines into spaces; drop every word that contains a digit.

    Args:
        text: The raw string.

    Returns:
        The cleaned string. Runs of spaces left behind by removed pieces are
        not collapsed.
    """
    text = text.lower()
    # Raw strings keep backslashes such as `\[`, `\S`, `\w` and `\d` intact for
    # the regex engine instead of relying on invalid string escape sequences.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space: deleting them outright would glue the
    # last word of one line to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text



In [None]:

def clean(text):
    """Run the cleaning pipeline on one excerpt and return the result.

    Only ``clean_text`` is applied; the stop-word removal step stays
    switched off.
    """
    cleaned = clean_text(text)
    # cleaned = remove_pun_stopwords(cleaned)
    return cleaned

In [None]:
# Clean the excerpt column of the training frame, then of the test frame.
for frame in (train, test):
    frame['excerpt'] = frame['excerpt'].apply(clean)

In [None]:
# One shared lemmatizer instance, reused for every excerpt.
lemmatizer = WordNetLemmatizer()


def word_lemmatizer(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Returns the lemmatized tokens joined by single spaces.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in text.split())
    return ' '.join(lemmas)



In [None]:
# Lemmatize the excerpt column of the training frame, then of the test frame.
for frame in (train, test):
    frame['excerpt'] = frame['excerpt'].apply(word_lemmatizer)

* Creating custom function to create k-folds

In [None]:
def create_folds(data, num_splits, random_state=10):
    """Assign a stratified fold index to every row of ``data``.

    The continuous ``target`` column is cut into equal-width bins (bin count
    from Sturges' rule) and ``StratifiedKFold`` is run on the bin labels
    instead of the raw targets.

    Args:
        data: DataFrame with a numeric ``target`` column. It is left
            unmodified.
        num_splits: Number of folds to create.
        random_state: Seed for the row shuffle. The default of 10 reproduces
            the previously hard-coded behaviour.

    Returns:
        A shuffled copy of ``data`` with a fresh ``0..n-1`` index and an
        additional integer ``kfold`` column.
    """
    # Shuffle first: `sample` returns a new frame, so the `kfold` column added
    # below never leaks into the caller's DataFrame.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data["kfold"] = -1

    # Sturges' rule: floor(1 + log2(n)) bins.
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Keep the bin labels in a local instead of a temporary column that has
    # to be dropped again afterwards.
    bins = pd.cut(data["target"], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=num_splits)

    # Stratify on the bins; the index was reset above, so the positional
    # validation indices returned by `split` are also valid row labels.
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[valid_idx, "kfold"] = fold

    return data

In [None]:
# Create 5 folds for training and validation, then rename the target column
# to `label`, the column name selected later when formatting the datasets.
train = create_folds(train, num_splits=5)
train = train.rename(columns={'target':'label'})

**Using Standard Scaler on the target variable**

In [None]:

# Standardize the label column. The fitted scaler is kept because it is used
# later to map the test predictions back with `inverse_transform`.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train['label'] = scaler.fit_transform(train[['label']])

In [None]:
# disable W&B logging as we don't have access to the internet
%env WANDB_DISABLED=True

In [None]:
# Local path of the pretrained checkpoint (loaded from an attached dataset),
# the per-device batch size, and the maximum token length passed to the tokenizer.
model_checkpoint = '../input/distillbert-huggingface-model'
batch_size = 16
max_length = 256

* Creating train and validation datasets

In [None]:
# Fold 0 is held out for validation; all remaining folds form the training set.
train_dataset = Dataset.from_pandas(train[train.kfold != 0].reset_index(drop=True))
valid_dataset = Dataset.from_pandas(train[train.kfold == 0].reset_index(drop=True))

In [None]:
# Tokenizer shipped with the pretrained checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)


def tokenize(batch):
    """Tokenize the ``excerpt`` field of ``batch``.

    Padding and truncation are enabled, with ``max_length`` as the cap.
    """
    return tokenizer(
        batch['excerpt'],
        padding=True,
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Tokenize each split as one single batch (batch_size equals the dataset length).
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))

In [None]:
# Return only the token ids, attention mask and label, as torch tensors.
columns_to_return = ['input_ids', 'label', 'attention_mask']
train_dataset.set_format(type='torch', columns=columns_to_return)
valid_dataset.set_format(type='torch', columns=columns_to_return)

In [None]:
# Load the pretrained checkpoint with a single output (num_labels=1) so the
# model predicts one value per excerpt.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=1)

In [None]:
def compute_metrics(pred):
    """Compute the root-mean-squared error for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (model outputs), each holding one value per example.

    Returns:
        A dict with the single key ``'rmse'`` mapped to a Python float.
    """
    # Flatten both arrays: subtracting an (N, 1) prediction column from an
    # (N,) label vector would otherwise broadcast to an (N, N) matrix.
    targets = np.asarray(pred.label_ids, dtype=np.float64).reshape(-1)
    predictions = np.asarray(pred.predictions, dtype=np.float64).reshape(-1)
    # Computed directly with NumPy rather than through the `squared=False`
    # flag of `mean_squared_error`, which newer scikit-learn no longer accepts.
    rmse = float(np.sqrt(np.mean((predictions - targets) ** 2)))
    return {
        'rmse': rmse,
    }

# Training configuration: evaluate and checkpoint once per epoch, train for
# three epochs with mixed precision, and reload the best checkpoint at the end.
args = TrainingArguments(
    "outputs_dir",
    evaluation_strategy="epoch",
    # Checkpoint on the same schedule as evaluation; without this there is no
    # per-epoch checkpoint for `load_best_model_at_end` to reload.
    save_strategy="epoch",
    learning_rate=2e-5,
    fp16=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    seed=7,
    weight_decay=0.005,
    load_best_model_at_end=True,
    # Pick the best checkpoint by the RMSE reported by `compute_metrics`
    # (lower is better).
    metric_for_best_model="rmse",
    greater_is_better=False,
)

# Trainer that fine-tunes `model` on the training split and evaluates on the
# held-out validation split.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()


In [None]:
# Give the test frame a placeholder `label` column so it has the same columns
# that are selected for the training datasets, then tokenize it in one batch.
# NOTE(review): the rename is presumably a no-op if the test file has no
# `target` column — confirm against the data.
test = test.rename(columns={'target':'label'})
test['label'] = 1

test_dataset = Dataset.from_pandas(test)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

In [None]:
# Return only the token ids, attention mask and label of the test set, as torch tensors.
columns_to_return = ['input_ids', 'label', 'attention_mask']
test_dataset.set_format(type='torch', columns=columns_to_return)

In [None]:
# Predict on the test set, flatten element 0 of the result (presumably the
# array of raw model outputs — confirm) and undo the label standardization
# with the scaler fitted on the training labels.
test_preds = trainer.predict(test_dataset)
test_preds= scaler.inverse_transform(pd.DataFrame(test_preds[0].reshape(1,-1)[0]))

In [None]:
# Build the submission frame: one row per test id with its flattened prediction.
test_ids = test['id'].values

submission = pd.DataFrame({
    'id': test_ids,
    'target': test_preds.reshape(1,-1)[0]
})



In [None]:
# Write the submission file to the working directory without the index column.
submission.to_csv('submission.csv', index=False)
