This notebook was inspired by [this discussion](https://www.kaggle.com/c/commonlitreadabilityprize/discussion/240871). It walks through the implementation of **non-Transformer** models, as a baseline before trying Transformer models.
This notebook is currently a work in progress.

This notebook produces submissions for the following models:

1. TFIDF + Linear Regression
2. TFIDF + Random Forest Regressor
3. TFIDF + SVD + Random Forest Regressor
4. TFIDF + SVD + Ridge Regression
5. Spacy vectors + Ridge Regression
6. Universal Sentence Encoder + Ridge Regression
7. Spacy vectors + Pytorch Regressor [To Do]
8. Spacy vectors + Pytorch Ranker [To Do]
9. Pytorch LSTM [To Do]

### Load Libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge 
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import TruncatedSVD

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()
import string
import re

##parallel processing
import dask

import spacy
from tqdm import tqdm

### Read Data

In [None]:
# Root directory of the competition data files.
datapath = "/kaggle/input/commonlitreadabilityprize/"
# Drop any trailing slash so that joining with "/" below does not produce a
# doubled separator (".../commonlitreadabilityprize//train.csv").
data_root = datapath.rstrip("/")

sub_df = pd.read_csv(f"{data_root}/sample_submission.csv")
train_df = pd.read_csv(f"{data_root}/train.csv")
test_df = pd.read_csv(f"{data_root}/test.csv")

# Fail early, with a readable message, if the columns the modelling cells rely on are missing.
assert {"excerpt", "target"} <= set(train_df.columns), "train.csv must provide 'excerpt' and 'target'"
assert {"id", "excerpt"} <= set(test_df.columns), "test.csv must provide 'id' and 'excerpt'"

# Last expression of the cell: shown as its output (shapes of the three frames).
train_df.shape, test_df.shape, sub_df.shape

**Please note: with default parameters, I get a better public LB score without any of the preprocessing steps below than with them.**

### Text preprocessing (Cleaning)

1. Remove stop words
2. Stemming and lemmatization
3. Remove/replace punctuation
4. Remove URLs (links)
5. Remove numbers

In [None]:
# %%time
# NOTE: this whole cell is disabled -- every line is commented out, so the models
# below are trained on the raw, unprocessed excerpts.
# NOTE(review): things to check before re-enabling it:
#   * every transformation below is applied to train_df['excerpt'] only; test_df is
#     never cleaned, so train and test would be vectorized from differently-processed text.
#   * preprocess_stemming is defined but never called in this cell.
#   * the stop-word filter compares tokens to STOPWORDS without lower-casing them first.
#   * the URL pattern is a non-raw string containing `\S` / `\.`; prefer a raw string (r'...').
# def preprocess_stemming(text):
#     tokens = word_tokenize(text)
#     stems = []
#     for w in tokens:
#         stems.append(PorterStemmer().stem(w))
#     return stems

# def process_lemmatization(text):
#     tokens = word_tokenize(text)
#     lemmas = []
#     for w in tokens:
#         word1 = LEMMATIZER.lemmatize(w, pos = "n")
#         word2 = LEMMATIZER.lemmatize(word1, pos = "v")
#         word3 = LEMMATIZER.lemmatize(word2, pos = ("a"))
#         lemmas.append(word3)
#     return " ".join(lemmas)

# ### to remove stopwords
# train_df['excerpt'] = train_df['excerpt'].apply(lambda text: " ".join([val for val in word_tokenize(text) if val not in STOPWORDS]))

# # To perform lemmatization
# ##using dask to speedup
# #tasks = train_df['excerpt'].apply(lambda text: (dask.delayed(process_lemmatization)(text)))
# #train_df['excerpt'] = dask.compute(*tasks)
# train_df['excerpt'] = train_df['excerpt'].apply(lambda text: process_lemmatization(text))

# #string punctuation removal
# train_df['excerpt'] = train_df['excerpt'].apply(lambda text: text.translate(str.maketrans('', '', string.punctuation)))
# ## Removes links
# train_df['excerpt'] = train_df['excerpt'].apply(lambda text: re.sub('https?://\S+|www\.\S+', '', text))
# ## Removes numbers
# train_df['excerpt'] = train_df['excerpt'].apply(lambda text: re.sub(r'[^\D\s]','',text))

In [None]:
# Preview the first three training rows (rendered as the cell's output).
train_df.head(3)

#### TFIDF

In [None]:
# TF-IDF features: the vectorizer is fitted on the training excerpts only, and the
# same fitted vectorizer is then applied to the test excerpts.
train_text, test_text = train_df['excerpt'], test_df['excerpt']
y = train_df['target']  # value every model below is fitted against

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

## Models

#### 1. Linear Regression

In [None]:
# Model 1: ordinary least-squares regression on the TF-IDF matrix.
LR_tfidf = LinearRegression().fit(X_train, y)

# score() returns R^2 on the given data. A bare expression in the middle of a cell is
# silently discarded, so print it to make the value visible.
print('Train R^2:', LR_tfidf.score(X_train, y))

y_train_lr_pred = LR_tfidf.predict(X_train)
test_lr_pred = LR_tfidf.predict(X_test)
sub_df['target'] = test_lr_pred

# NOTE: these are errors on the rows the model was fitted on, not held-out estimates.
train_mse = metrics.mean_squared_error(y, y_train_lr_pred)  # computed once, reused for the RMSE
print('Mean Absolute Error:', metrics.mean_absolute_error(y, y_train_lr_pred))
print('Mean Squared Error:', train_mse)
print('Root Mean Squared Error:', np.sqrt(train_mse))

print(sub_df.head())
sub_df.to_csv('LR_submission.csv', index=False)

## Scores noted by the author (presumably public LB scores -- TODO confirm):
## 1.01 - with preprocessing
## 0.72 - without preprocessing

#### 2. RandomForestRegressor

In [None]:
%%time
rf_tfidf = RandomForestRegressor().fit(X_train, y)
y_train_rf_pred = rf_tfidf.predict(X_train)
test_rf_pred = rf_tfidf.predict(X_test)
sub_df['target'] = test_rf_pred

print('Mean Absolute Error:', metrics.mean_absolute_error(y, y_train_rf_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y, y_train_rf_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y,y_train_rf_pred)))

print(sub_df.head())
sub_df.to_csv('submission.csv', index=False) 
## 0.92 - with preprocessing 
## 0.81 - without preprocessing

#### 3. TFIDF + SVD + RandomForestRegressor

##### SVD

In [None]:
%%time
svdT = TruncatedSVD(n_components=400)
svd_X_train = svdT.fit_transform(X_train)
svd_X_test = svdT.transform(X_test)

In [None]:
%%time
svd_rf_tfidf = RandomForestRegressor().fit(svd_X_train, y)
y_train_svdrf_pred = svd_rf_tfidf.predict(svd_X_train)
test_svdrf_pred = svd_rf_tfidf.predict(svd_X_test)
sub_df['target'] = test_svdrf_pred

print('Mean Absolute Error:', metrics.mean_absolute_error(y, y_train_svdrf_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y, y_train_svdrf_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y,y_train_svdrf_pred)))

print(sub_df.head())
sub_df.to_csv('submission.csv', index=False) 
## 0.773 - without preprocessing

#### 4. TFIDF + SVD + Ridge Regressor

In [None]:
%%time
regressor = Ridge(fit_intercept=True, normalize=False)
scores = cross_val_score(regressor, svd_X_train, y, cv=5, 
                         scoring='neg_root_mean_squared_error')
print(f'Average Root mean squared error: {np.abs(np.mean(scores))}')

regressor = regressor.fit(svd_X_train, y)
test_df['target'] = regressor.predict(svd_X_test)
test_df[['id','target']].to_csv('submission.csv', index=False)
## 0.722 - without preprocessing

#### 5. Spacy vectors + Ridge Regression

All Credits to Sumit Kumar @anaverageengineer https://www.kaggle.com/anaverageengineer/comlrp-baseline-for-complete-beginners

In [None]:
%%time
RANDOM_STATE = 147
nlp = spacy.load('en_core_web_lg')

with nlp.disable_pipes():
    X_train = np.vstack([nlp(text).vector for text in tqdm(train_df['excerpt'])])
    y = train_df['target']
    print(f'Shape of Train vectors: {X_train.shape}')

    X_test = np.vstack([nlp(text).vector for text in tqdm(test_df['excerpt'])])
    print(f'Shape of Test vectors: {X_test.shape}')
    
regressor = Ridge(fit_intercept=True, normalize=False)
scores = cross_val_score(regressor, X_train, y, cv=5, 
                         scoring='neg_root_mean_squared_error')
print(f'Average Root mean squared error: {np.abs(np.mean(scores))}')

regressor = regressor.fit(X_train, y)
test_df['target'] = regressor.predict(X_test)
test_df[['id','target']].to_csv('submission.csv', index=False)

#### 6. Universal Sentence Encoder + Ridge Regression

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Load the Universal Sentence Encoder from a local directory (presumably a dataset
# attached to the notebook -- TODO confirm) rather than downloading it.
# Online alternative:
#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
use_model_dir = "../input/universalsentenceencodermodels/universal-sentence-encoder-models/use-large"
embed = hub.load(use_model_dir)

# Optional smoke test (disabled):
# embeddings = embed([
#     "The quick brown fox jumps over the lazy dog.",
#     "I am a sentence for which I would like to get its embedding"])
# print(embeddings)

In [None]:
%%time
RANDOM_STATE = 147

X_train = np.vstack([embed([text]) for text in tqdm(train_df['excerpt'])])
y = train_df['target']
print(f'Shape of Train vectors: {X_train.shape}')

X_test = np.vstack([embed([text]) for text in tqdm(test_df['excerpt'])])
print(f'Shape of Test vectors: {X_test.shape}')
    
regressor = Ridge(fit_intercept=True, normalize=False)
scores = cross_val_score(regressor, X_train, y, cv=5, 
                         scoring='neg_root_mean_squared_error')
print(f'Average Root mean squared error: {np.abs(np.mean(scores))}')

regressor = regressor.fit(X_train, y)
test_df['target'] = regressor.predict(X_test)
test_df[['id','target']].to_csv('submission.csv', index=False)