### Linear regression model vs. transformer model (RoBERTa-base) performance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import transformers
import missingno as msno
import re
import spacy
import nltk
from wordcloud import WordCloud

from sklearn.model_selection import KFold
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import RootMeanSquaredError
from nltk.tokenize import sent_tokenize, word_tokenize 
from transformers import TFAutoModel,AutoTokenizer,TFAutoModelForSequenceClassification
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from collections import Counter

import warnings
# Silence every warning for the rest of the session; note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# Use seaborn's 'husl' colour palette for the plots that follow.
sns.set_palette('husl')





In [None]:
# Print the installed TensorFlow version.
print(tf.__version__)

In [None]:
# Load the training CSV (the path is relative to the working directory) and
# preview its first rows.
df=pd.read_csv('../input/commonlitreadabilityprize/train.csv')
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Report the number of rows in the training frame.
print(f'length of df : {len(df)}')

In [None]:
# Visualise per-column completeness with missingno's bar chart, sorted ascending.
msno.bar(df,sort="ascending", figsize=(10,5))

In [None]:
# Drop the first column by position.
# NOTE(review): presumably the row id column - confirm against the CSV header.
df=df.iloc[:,1:]

In [None]:
def my_plot(df,row):
    """Draw a KDE plot and a violin plot side by side for each summary column.

    Figure row ``i`` shows the ``i``-th entry of ``['target', 'standard_error']``:
    the left axis holds the kernel density estimate with a dotted vertical line
    at the column mean, the right axis holds a violin plot of the same values.

    Args:
        df: DataFrame providing the ``target`` and ``standard_error`` columns.
        row: Number of figure rows, i.e. how many of the columns to plot
            (1 plots only ``target``, 2 plots both).

    Raises:
        ValueError: If ``row`` is smaller than 1 or larger than the number of
            plottable columns.
    """
    feat = ['target', 'standard_error']
    # Fail before any figure is created instead of crashing half-way through.
    if not 1 <= row <= len(feat):
        raise ValueError(f'row must be between 1 and {len(feat)}, got {row}')

    # Kept as a global setting: the default figure size is intentionally
    # changed for later plots as well, exactly as before.
    plt.rcParams['figure.figsize'] = (15, 5)
    # squeeze=False keeps ``axes`` two-dimensional even when row == 1, so the
    # (row, column) unpacking below always works.
    _, axes = plt.subplots(row, 2, squeeze=False)
    plt.subplots_adjust(top=1.95)

    for (kde_ax, violin_ax), name in zip(axes, feat):
        values = df[name]

        kde_ax.axvline(values.mean(), linestyle=':', linewidth=2)
        sns.kdeplot(values, color='red', ax=kde_ax)
        kde_ax.set_title(name)

        # Pass the data as ``x`` explicitly; a bare positional argument is
        # interpreted differently depending on the seaborn version.
        sns.violinplot(x=values, color='red', ax=violin_ax)
        violin_ax.set_title(name)



In [None]:
# Plot two figure rows of distribution plots for the training frame.
my_plot(df,2)

In [None]:
df[df['url_legal'].notnull()].head()   # preview the rows that carry a url_legal value

In [None]:
len(df['license'].unique())    # number of distinct license values (a missing value counts as one)

In [None]:
# Show the license of every row that has one.
df['license'][df['license'].notnull()]

In [None]:
# Count excerpts per license, ignoring rows without a license.
plt.figure(figsize=(20,10))
# Pass the Series as ``x`` explicitly: a bare positional argument is
# interpreted differently depending on the seaborn version.
sns.countplot(x=df['license'][df['license'].notnull()])
# Rotate the labels after plotting so the rotation is applied to the final
# category ticks rather than to the placeholder ticks of an empty axes.
plt.xticks(rotation=90)

In [None]:
# Keep only the excerpt text and the regression target, then report the
# span of the target values.
df = df.loc[:, ['excerpt', 'target']]
print(f'range of target values : ({df.target.min()},{df.target.max()})')

In [None]:
# English stop words from NLTK, used by the word cloud and the text filter.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words("english")

# Refer to the excerpt column simply as 'text' from here on.
df = df.rename(columns={'excerpt': 'text'})

In [None]:
# Skewness of the target distribution. ``skew`` is a method and must be
# called; without the parentheses the cell only shows the bound method.
df['target'].skew()

### Word-frequency visualization of the excerpts as a word cloud

In [None]:
# Build the cloud from the full text of every excerpt. ``str(df['text'])``
# would only give the truncated Series repr (a handful of abbreviated rows
# plus index and dtype metadata), not the corpus.
corpus = ' '.join(df['text'].astype(str))
cloud=WordCloud(background_color = 'black',stopwords=stopwords,max_words=200,max_font_size = 40,scale=3).generate(corpus)

title='word count'
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(top = 2.25)
fig.suptitle(title, fontsize = 20)
plt.imshow(cloud)

In [None]:
# Compiled once at import time. A raw string is required: "\S" and "\." are
# regex escapes, not Python string escapes.
_NON_LETTER_OR_URL = re.compile(r"[^a-zA-Z]|https?://\S+|www\.\S+")


def text_clean(text):
    """Replace URLs and every character that is not an ASCII letter with a space.

    A whole ``http(s)://...`` or ``www....`` token is replaced by a single
    space; any other non-letter character (digits, punctuation, whitespace,
    non-ASCII letters) is replaced one-for-one by a space. Runs of spaces are
    not collapsed here.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _NON_LETTER_OR_URL.sub(' ', text)
  

In [None]:
# Clean every excerpt, then trim the leading/trailing whitespace left behind.
x_data = df['text'].map(text_clean).str.strip()

In [None]:
# Collapse each run of whitespace into a single space, then preview the result.
x_data = list(map(lambda cleaned: " ".join(cleaned.split()), x_data))
x_data[:10]

In [None]:
# Lemmatize every cleaned excerpt with spaCy's small English pipeline and
# rebuild each one as a space-separated string of lemmas.
nlp = spacy.load('en_core_web_sm')
train_data = [
    " ".join(str(token.lemma_) for token in nlp(excerpt))
    for excerpt in x_data
]

In [None]:
# Drop the '-PRON-' placeholder and the English stop words in a single pass.
# The excluded words live in a set so every membership test is O(1) instead
# of a scan over the whole stop-word list.
excluded_words = set(stopwords) | {'-PRON-'}
train_data = [
    ' '.join(word for word in data.split() if word not in excluded_words)
    for data in train_data
]
y_data=df['target'].values

In [None]:
# Spot-check the first two cleaned excerpts next to their targets.
x_data[:2],y_data[:2]

In [None]:
# Spot-check the first two lemmatized, stop-word-filtered excerpts next to their targets.
train_data[:2],y_data[:2]

### Dataset preparation for linear regression and RoBERTa

In [None]:
class DATASET:
    """Turn pre-processed excerpts and their targets into model-ready inputs.

    Calling the instance with ``model_name='LR'`` yields a TF-IDF train/test
    split; calling it with ``model_name='roberta'`` yields padded token ids
    for the whole corpus.
    """

    def __init__(self,train_data,y_data):
        """Store the texts and targets.

        Args:
            train_data: Sequence of pre-processed text strings.
            y_data: Target values aligned with ``train_data``.
        """
        self.train_data=train_data
        self.y_data=y_data

    def __call__(self,pad_sequences,train_test_split,model_name,roberta_tokenizer,
                 test_size=0.3,maxlen=200,random_state=None):
        """Build the features for the requested model.

        Args:
            pad_sequences: Callable with the signature of Keras' ``pad_sequences``.
            train_test_split: Callable with the signature of scikit-learn's
                ``train_test_split``.
            model_name: ``'LR'`` or ``'roberta'``.
            roberta_tokenizer: Tokenizer exposing ``encode`` and ``pad_token_id``
                (only used for ``'roberta'``).
            test_size: Held-out fraction for the ``'LR'`` split.
            maxlen: Token length every ``'roberta'`` sequence is cut/padded to.
            random_state: Seed forwarded to ``train_test_split`` for a
                reproducible ``'LR'`` split.

        Returns:
            For ``'LR'``: ``(train_features, test_features, y_train, y_test)``.
            For ``'roberta'``: ``(padded_token_ids, y_data)``.

        Raises:
            ValueError: If ``model_name`` is not one of the supported names.
        """
        if model_name=='LR':
            x_train,x_test,y_train,y_test=train_test_split(
                self.train_data,self.y_data,test_size=test_size,random_state=random_state)
            tfidf=TfidfVectorizer(analyzer='word', ngram_range=(1,3))
            # Fit the vocabulary and IDF weights on the training split only;
            # fitting on train+test would leak test statistics into the features.
            train_table_data=tfidf.fit_transform(x_train)
            test_table_data=tfidf.transform(x_test)
            return train_table_data,test_table_data,y_train,y_test

        if model_name=='roberta':
            # Let the tokenizer truncate to ``maxlen`` so the special start and
            # end tokens are kept on long excerpts.
            sequences=[
                roberta_tokenizer.encode(text,add_special_tokens=True,truncation=True,max_length=maxlen)
                for text in self.train_data
            ]
            # Pad (and, as a safety net, cut) at the END: the sequence-
            # classification head reads the hidden state at index 0, so the
            # start token must stay there instead of a padding token.
            roberta_data=pad_sequences(
                sequences,
                maxlen=maxlen,
                padding='post',
                truncating='post',
                value=roberta_tokenizer.pad_token_id,
            )
            return roberta_data,self.y_data

        # Fail loudly instead of silently returning None to the caller.
        raise ValueError(f"unknown model_name {model_name!r}; expected 'LR' or 'roberta'")


In [None]:
# Load the tokenizer that belongs to the 'roberta-base' checkpoint.
roberta_tokenizer=AutoTokenizer.from_pretrained('roberta-base')

In [None]:
# Build both feature sets from the same pre-processed corpus: the 'LR' call
# returns a train/test split, the 'roberta' call returns inputs for the whole
# dataset.
data=DATASET(train_data,y_data)
x_train_data,x_test_data,y_train_data,y_test_data=data(pad_sequences,train_test_split,'LR',roberta_tokenizer)
x_roberta_data,y_roberta_data=data(pad_sequences,train_test_split,'roberta',roberta_tokenizer)

In [None]:
# Report the shapes of the linear-regression training split and of the RoBERTa inputs.
print(f'for linear reg training sample set: {x_train_data.shape,y_train_data.shape} and for roberta whole dataset : {x_roberta_data.shape,y_roberta_data.shape}')

## Linear regression model

In [None]:
class Linear_Model(tf.keras.Model):
    """Wrapper that fits scikit-learn's ``LinearRegression`` and predicts with it.

    The Keras base class is kept for backward compatibility; the regression
    itself is done entirely by scikit-learn.
    """

    def __init__(self,x_train_data,y_train_data):
        """Store the training data and create the (unfitted) estimator.

        Args:
            x_train_data: Training feature matrix.
            y_train_data: Training targets aligned with ``x_train_data``.
        """
        # A Keras Model subclass must run the base initializer before any
        # attribute is assigned on the instance.
        super().__init__()
        self.x_train_data=x_train_data
        self.y_train_data=y_train_data
        self.lreg=LinearRegression()

    def linear_regression_result(self,x_test_data):
        """Fit the estimator on the stored training data and predict.

        The estimator is re-fitted on every call.

        Args:
            x_test_data: Feature matrix to predict for; it must have the same
                number of columns as the training matrix.

        Returns:
            The predictions of the fitted ``LinearRegression`` for ``x_test_data``.
        """
        self.lreg.fit(self.x_train_data,self.y_train_data)
        return self.lreg.predict(x_test_data)

  

In [None]:
# Fit the linear baseline on the training split and predict the held-out split.
linear_model=Linear_Model(x_train_data,y_train_data)
lr_y_pred=linear_model.linear_regression_result(x_test_data)

In [None]:
# Performance metric evaluation on linear regression.
from sklearn.metrics import mean_squared_error
# Arguments are given in the documented (y_true, y_pred) order, and the square
# root is taken explicitly: the `squared=False` shortcut is deprecated and
# absent from newer scikit-learn releases.
print(f'RMSE Score {np.sqrt(mean_squared_error(y_test_data,lr_y_pred))}')

### RoBERTa-base

In [None]:
class Custom_roberta(tf.keras.Model):
    """Keras wrapper around the pretrained 'roberta-base' sequence classifier.

    The checkpoint is loaded with ``num_labels=1``, i.e. with a single output
    value per sequence, and without hidden-state or attention outputs.
    """

    def __init__(self):
        super(Custom_roberta,self).__init__()
        self.roberta_model = TFAutoModelForSequenceClassification.from_pretrained('roberta-base',output_hidden_states=False, output_attentions=False, num_labels=1)

    def call(self,input_ids):
        """Run the wrapped model on a batch of token ids.

        Args:
            input_ids: Integer tensor of token ids, padded with the model's
                padding token id.

        Returns:
            The output object of the wrapped Hugging Face model, unchanged.
        """
        pad_token_id=self.roberta_model.config.pad_token_id
        if pad_token_id is None:
            # No padding id configured: nothing can be masked out.
            return self.roberta_model(input_ids)

        # Mark real tokens with 1 and padding with 0 so that self-attention
        # ignores the padded positions instead of attending to them.
        attention_mask=tf.cast(tf.not_equal(input_ids,pad_token_id),tf.int32)
        return self.roberta_model(input_ids,attention_mask=attention_mask)

In [None]:
def loss_func(y_true,y_pred):
    """Root mean squared error (RMSE) between predictions and targets.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values.

    Returns:
        Scalar tensor: the square root of the mean squared difference.
    """
    residual = y_pred-y_true
    mean_squared = tf.reduce_mean(tf.square(residual))
    return tf.sqrt(mean_squared)



In [None]:
# Simple learning-rate schedule for use with `fit`: the rate stays constant
# for the first 10 epochs and is then decreased exponentially.

def schedule(epochs,lr):
    """Return the learning rate to use for the given epoch.

    Args:
        epochs: Zero-based index of the epoch that is about to start.
        lr: The current learning rate.

    Returns:
        ``lr`` unchanged while ``epochs < 10``; afterwards ``lr * exp(-0.01)``
        as a plain Python float.
    """
    if epochs<10:
        return lr

    # Return a plain float rather than a tf.Tensor: newer Keras versions of
    # LearningRateScheduler only accept float results from the schedule.
    return float(lr*np.exp(-0.01))
    

In [None]:
# Number of samples corresponding to an 80% training share of the RoBERTa data.
# NOTE(review): train_size is not referenced anywhere else in the visible
# source; the K-fold routine performs its own splitting - confirm it is needed.

train_size=int(0.8*(len(x_roberta_data)))

In [None]:
# Check that the RoBERTa inputs and targets have matching leading dimensions.
x_roberta_data.shape,y_roberta_data.shape

In [None]:
# K-fold cross-validated fine-tuning.
def train_in_folds(x_roberta_data,y_roberta_data,folds,epochs=6,batch_size=10,learning_rate=2e-5,random_state=None):
    """Fine-tune a fresh ``Custom_roberta`` on every K-fold split.

    For each fold a new model is trained on the training indices and validated
    on the held-out indices, with MSE as the loss, RMSE as the metric, the
    ``schedule`` learning-rate callback, and early stopping on the validation
    RMSE (patience 2, best weights restored). The best validation RMSE of each
    fold is printed.

    Args:
        x_roberta_data: Array of padded token ids, indexable by an index array.
        y_roberta_data: Array of targets aligned with ``x_roberta_data``.
        folds: Number of cross-validation folds.
        epochs: Maximum number of epochs per fold.
        batch_size: Mini-batch size.
        learning_rate: Initial learning rate of the Adam optimizer.
        random_state: Seed for the shuffled fold assignment (``None`` keeps
            the split random).

    Returns:
        list[float]: The best validation RMSE of each fold, in fold order.
    """
    # Shuffled K-fold split from scikit-learn's model_selection module.
    kf = KFold(n_splits=folds,shuffle=True,random_state=random_state)
    best_scores=[]

    for (fold, (train_index, test_index)) in enumerate(kf.split(x_roberta_data)):
        print(f'for fold : {fold+1}\n')

        # Release the graph state left over from the previous fold; otherwise
        # every fold keeps another full model alive in memory.
        tf.keras.backend.clear_session()

        x_train=tf.convert_to_tensor(x_roberta_data[train_index])
        y_train=tf.convert_to_tensor(y_roberta_data[train_index])
        x_test=tf.convert_to_tensor(x_roberta_data[test_index])
        y_test=tf.convert_to_tensor(y_roberta_data[test_index])

        model=Custom_roberta()
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(optimizer=optimizer,loss='mse',metrics=[RootMeanSquaredError()])

        callback_1=tf.keras.callbacks.LearningRateScheduler(schedule)
        callback_2=tf.keras.callbacks.EarlyStopping(monitor='val_root_mean_squared_error',mode='min',restore_best_weights=True,patience=2)

        r=model.fit(x_train,y_train,batch_size=batch_size,validation_data=(x_test,y_test),epochs=epochs,callbacks=[callback_1,callback_2],verbose=1)
        print(f"best rmse : {np.min(r.history['val_root_mean_squared_error']):.4f}")
        print('\n')

        best_scores.append(float(np.min(r.history['val_root_mean_squared_error'])))
        # Drop the reference so the model can be freed before the next fold.
        del model

    return best_scores



In [None]:
# Run 5-fold cross-validated training over the whole RoBERTa dataset.
train_in_folds(x_roberta_data,y_roberta_data,folds=5)

In [None]:
#### Supervised learning using transfer learning: the pretrained model is fine-tuned for the downstream task only, so the warnings can be ignored.

We can clearly see that the transformer model outperforms the simple linear model in terms of the RMSE metric. In the next update I will try to optimize the RMSE score further.