In [None]:
import numpy as np
import pandas as pd
import re

#SKlearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import BayesianRidge

#NLTK
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
import keras.backend as K
from keras.callbacks import ModelCheckpoint
from keras.models import Model
from keras.models import load_model

#TF Transformers
from transformers import AutoTokenizer,TFAutoModel

In [None]:
def preProcesstext(tweet_dataframe, stop_words=None, lemmatize=False):
    """Clean the ``excerpt`` column of a dataframe into normalized strings.

    Each excerpt is converted with ``str``, every run of non-letter
    characters is replaced by a single space, the text is lower-cased, and
    stop words as well as words of three characters or fewer are dropped.

    Parameters
    ----------
    tweet_dataframe : pandas.DataFrame
        Frame with an ``excerpt`` column holding the raw text.
    stop_words : iterable of str, optional
        Words to remove (matched case-insensitively). When omitted, the first
        column of ``../input/stop-words/stopwords.csv`` is used.
    lemmatize : bool, default False
        When True, every remaining word is additionally passed through NLTK's
        ``WordNetLemmatizer``. It is off by default because the pipeline has
        always returned the non-lemmatized text.

    Returns
    -------
    list of str
        One cleaned string per row, in the original row order.
    """
    if stop_words is None:
        stop_words = pd.read_csv('../input/stop-words/stopwords.csv', names=['stopword'])
        stop_words = stop_words.iloc[:, 0].values.tolist()

    # A set gives O(1) membership tests; non-string entries (e.g. NaN from a
    # blank CSV row) can never match a word, so they are skipped.
    stop_set = {word.lower() for word in stop_words if isinstance(word, str)}

    cleaned = []
    for raw in tweet_dataframe['excerpt'].values:
        # Keep letters only, then lower-case BEFORE stop-word filtering so that
        # capitalised stop words ("There", "About") are removed as well.
        letters_only = re.sub('[^a-zA-Z]+', ' ', str(raw)).lower()
        kept = [
            word for word in letters_only.split()
            if len(word) > 3 and word not in stop_set
        ]
        cleaned.append(' '.join(kept))

    if lemmatize:
        # Imported lazily: only needed (together with the WordNet corpus) when
        # lemmatization is explicitly requested.
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        cleaned = [
            ' '.join(lemmatizer.lemmatize(token) for token in wordpunct_tokenize(doc))
            for doc in cleaned
        ]

    return cleaned

In [None]:
# Directory of the pretrained checkpoint; both the tokenizer and the TF model
# below are loaded from this same location.
BASE_MODEL = '../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# NOTE(review): num_labels=1 is passed, but this notebook only reads the raw
# model output (index 0) in get_embedding -- confirm the argument has any effect.
transformer_model = TFAutoModel.from_pretrained(BASE_MODEL,num_labels=1)

In [None]:
def get_data(train_text):
    """Preprocess and tokenize the excerpts of a dataframe.

    Despite the parameter name, this is used for both the train and the test
    frame.

    Parameters
    ----------
    train_text : pandas.DataFrame
        Frame with an ``excerpt`` column (consumed by ``preProcesstext``).

    Returns
    -------
    dict
        Maps each name in ``tokenizer.model_input_names`` to the matching
        NumPy array produced by the tokenizer.
    """
    cleaned_texts = preProcesstext(train_text)
    # truncation=True keeps over-long excerpts within the tokenizer's maximum
    # length instead of handing the model a sequence it cannot process.
    tokenized = tokenizer(
        cleaned_texts,
        padding=True,
        truncation=True,
        return_tensors="np",
    )
    return {feat: tokenized[feat] for feat in tokenizer.model_input_names}

def get_embedding(X_train, batch_size=100):
    """Run the transformer over tokenized inputs and collect one vector per row.

    The inputs are fed to ``transformer_model`` in consecutive slices of
    ``batch_size`` rows. From the first element of each model output, the
    entry at position 0 along the second axis is kept (presumably the hidden
    state of the leading special token -- confirm against the model docs).

    Parameters
    ----------
    X_train : dict
        Must provide ``'input_ids'`` and ``'attention_mask'`` arrays, as
        returned by ``get_data``.
    batch_size : int, default 100
        Number of rows sent through the model per forward pass.

    Returns
    -------
    numpy.ndarray
        The per-row vectors stacked along axis 0; an empty array when there
        are no input rows.
    """
    input_ids = X_train['input_ids']
    attention_mask = X_train['attention_mask']

    batches = []
    for start in range(0, len(input_ids), batch_size):
        stop = start + batch_size
        outputs = transformer_model(
            input_ids=input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        batches.append(outputs[0][:, 0, :].numpy())

    if not batches:
        # Same result the list-based accumulation gave for empty input.
        return np.array([])
    # One concatenate over whole batches instead of a Python list of rows.
    return np.concatenate(batches, axis=0)

In [None]:
# Read the competition CSVs and tokenize the excerpts of each split
train_text = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_text = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
X_train, X_test = get_data(train_text), get_data(test_text)

# Standardize the target; the fitted scaler is kept in `sc` so predictions
# can be mapped back to the original scale later. `y` is a column vector.
sc = StandardScaler()
y = sc.fit_transform(train_text['target'].values.reshape(-1, 1))

In [None]:
# Embeddings for the train and test datasets: each call runs the tokenized
# inputs through the transformer via get_embedding and returns a NumPy array.
embedding_train=get_embedding(X_train)    
embedding_test=get_embedding(X_test)

In [None]:
#Multiple regressors were used out of which Bayesian Ridge performed the best
model = BayesianRidge()
# `y` is the (n, 1) column vector returned by StandardScaler.fit_transform;
# flatten it so the regressor receives a 1-D target and scikit-learn does not
# emit its column-vector DataConversionWarning.
model.fit(embedding_train, y.ravel())

#Predicting target for test dataset, then undoing the target scaling
y_pred = model.predict(embedding_test)
y_pred = sc.inverse_transform(y_pred.reshape(-1, 1))

In [None]:
# Assemble the submission: one row per test id with its predicted target
# (y_pred is flattened from its column-vector shape), written without the index.
df = pd.DataFrame({'id': test_text['id'], 'target': y_pred.ravel()})
df.to_csv('./submission.csv', index=False)

**Next Steps:**

* Applying grid search CV to get optimal hyperparameters for Bayesian Ridge
* Better preprocessing of text

Open to more suggestions!