# CommonLit Readability Prize
- This notebook covers some of the most basic ML models and pre-processing techniques that a beginner can approach easily.
- The basic pre-processing includes removing stop words, converting all the text to lower case, removing links, and expanding contractions such as won't and couldn't.
- For converting text to vectors, the notebook includes Bag of words, Binary Bag of words, TF-IDF, Average Word2Vec and TF-IDF weighted Word2Vec.
- In terms of the ML models, the notebook covers Linear Regression, AdaBoost (with RandomizedSearchCV), Bagging Regressor, Extra Trees Regressor, Gradient Boosting Regressor, Random Forest, Histogram Gradient Boosting Regressor.
- Also, the notebook uses LazyPredict just to see the performance of different regression models.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install bs4

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from gensim.models import Word2Vec
from scipy import sparse

In [None]:
# Load the training data into a DataFrame and print a summary of its columns.
train_csv_path = "../input/commonlitreadabilityprize/train.csv"
df = pd.read_csv(train_csv_path)
df.info()

In [None]:
# Remove, in place, the columns that the rest of the notebook does not use,
# then report the resulting (rows, columns) shape.
unused_columns = ["url_legal", "license", "standard_error", "id"]
df.drop(columns=unused_columns, inplace=True)
print(df.shape)

In [None]:
# Drop rows whose excerpt text repeats an earlier row, keeping the first
# occurrence. `subset` is given as a list: pandas documents it as a column
# label or a sequence of labels, and an unordered set is not a sequence
# (pandas versions that index into `subset` raise TypeError on a set).
df.drop_duplicates(subset=["excerpt"], keep='first', inplace=True)

# Series of excerpt texts used by the pre-processing cells below.
exc = df["excerpt"]
print(df.shape)

# Pre-Processing

In [None]:
# Print two sample excerpts (the first and the sixth remaining rows).
# .iloc selects by position: after drop_duplicates the index labels may have
# gaps, so a label lookup such as exc[0] could raise KeyError.
print(exc.iloc[0])
print("=" * 50)
print(exc.iloc[5])

In [None]:
import re

# Irregular contractions whose stem changes when expanded ("won't" is not
# "wo" + "not"); they must be handled before the generic suffix rules.
_SPECIFIC_CONTRACTIONS = (
    (re.compile(r"won't", re.IGNORECASE), "will not"),
    (re.compile(r"can\'t", re.IGNORECASE), "can not"),
)

# Generic suffix rules, applied in exactly this order.
_GENERAL_CONTRACTIONS = (
    (re.compile(r"n\'t", re.IGNORECASE), " not"),
    (re.compile(r"\'re", re.IGNORECASE), " are"),
    (re.compile(r"\'s", re.IGNORECASE), " is"),
    (re.compile(r"\'d", re.IGNORECASE), " would"),
    (re.compile(r"\'ll", re.IGNORECASE), " will"),
    (re.compile(r"\'t", re.IGNORECASE), " not"),
    (re.compile(r"\'ve", re.IGNORECASE), " have"),
    (re.compile(r"\'m", re.IGNORECASE), " am"),
)


def _expand_keeping_case(expansion):
    """Return a re.sub callback that capitalises *expansion* when the match is."""
    def replace(match):
        if match.group(0)[0].isupper():
            return expansion[0].upper() + expansion[1:]
        return expansion
    return replace


def decontracted(phrase):
    """Expand English contractions in *phrase* and return the new string.

    The irregular forms "won't" and "can't" are expanded first, then the
    generic suffixes (n't, 're, 's, 'd, 'll, 't, 've, 'm). Matching is
    case-insensitive, so "Won't" becomes "Will not" instead of the
    "Wo not" that a case-sensitive match followed by the generic n't rule
    would produce. Only the straight apostrophe (') is recognised, and
    every "'s" is expanded to " is", possessives included.

    Args:
        phrase: Text to process.

    Returns:
        The text with contractions expanded; text without contractions is
        returned unchanged.
    """
    for pattern, expansion in _SPECIFIC_CONTRACTIONS:
        phrase = pattern.sub(_expand_keeping_case(expansion), phrase)
    for pattern, expansion in _GENERAL_CONTRACTIONS:
        phrase = pattern.sub(expansion, phrase)
    return phrase

In [None]:
# Stop-word list adapted from https://gist.github.com/sebleier/554280
# The negations 'no', 'nor' and 'not' are deliberately left out of the set so
# they survive stop-word removal; 'br' is included as an extra entry.
stopwords = {
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
    "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
    "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

In [None]:
# Combining all the above steps
from tqdm import tqdm
from bs4 import BeautifulSoup
# Patterns are raw strings compiled once, outside the loop. The previous
# non-raw "\S*\d\S*" relied on invalid escape sequences, which current Python
# versions warn about.
url_pattern = re.compile(r"http\S+")            # links
digit_token_pattern = re.compile(r"\S*\d\S*")   # any token containing a digit
non_letter_pattern = re.compile(r"[^A-Za-z]+")  # runs of non-letter characters

preprocessed_excerpts = []

# tqdm is for printing the status bar
for sen in tqdm(exc):
    sen = url_pattern.sub("", sen)                  # remove links
    sen = BeautifulSoup(sen, 'lxml').get_text()     # strip HTML markup
    sen = decontracted(sen)                         # expand contractions
    sen = digit_token_pattern.sub("", sen).strip()  # drop tokens with digits
    sen = non_letter_pattern.sub(' ', sen)          # keep letters only
    # Only ASCII letters and spaces remain here, so lower-casing the whole
    # string once is equivalent to lower-casing every token.
    kept_words = [word for word in sen.lower().split() if word not in stopwords]
    preprocessed_excerpts.append(' '.join(kept_words))

# Splitting the Dataset

In [None]:
# Replace the raw excerpts with their cleaned versions, then separate the
# regression target from the remaining column(s).
df["excerpt"] = preprocessed_excerpts
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

# Featurization (Bag of Words)

In [None]:
# count_vect = CountVectorizer() 
# exc_train = X_train["excerpt"]
# exc_train = count_vect.fit_transform(exc_train)
# print(type(exc_train), exc_train.shape)

# exc_test = X_test["excerpt"]
# exc_test = count_vect.transform(exc_test)
# print(type(exc_test), exc_test.shape)

# Featurization (Binary Bag of Words)

In [None]:
# Binary bag of words: each feature records whether a term occurs in the
# excerpt, not how often.
count_vect = CountVectorizer(binary=True)

# The vocabulary is learned from the training excerpts only ...
exc_train = count_vect.fit_transform(X_train["excerpt"])
print(type(exc_train), exc_train.shape)

# ... and then reused unchanged to encode the held-out excerpts.
exc_test = count_vect.transform(X_test["excerpt"])
print(type(exc_test), exc_test.shape)

# Featurization (TF-IDF)

In [None]:
# tf_vect = TfidfVectorizer() 
# exc_train = X_train["excerpt"]
# exc_train = tf_vect.fit_transform(exc_train)
# print(type(exc_train), exc_train.shape)

# exc_train = exc_train.todense()
# print(type(exc_train), exc_train.shape)

# exc_test = X_test["excerpt"]
# exc_test = tf_vect.transform(exc_test)
# print(type(exc_test), exc_test.shape)

# exc_test = exc_test.todense()
# print(type(exc_test), exc_test.shape)

# Featurization (Word2Vec)

In [None]:
# list_of_exc_train = []
# exc_train = X_train["excerpt"]
# for exc in exc_train:
#     list_of_exc_train.append(exc.split())
    
# list_of_exc_test = []
# exc_test = X_test["excerpt"]
# for exc in exc_test:
#     list_of_exc_test.append(exc.split())
    
# # Training W2V model
# w2v_model = Word2Vec(list_of_exc_train, min_count=5, vector_size=300, workers=4, epochs=50)
# w2v_words = list(w2v_model.wv.key_to_index)

# Average W2V

In [None]:
# # Converting exc_train from text to vectors
# sent_vectors = []
# for sent in tqdm(list_of_exc_train):
#     sent_vec = np.zeros(300) # length must match the vector_size (300) the Word2Vec model above was trained with
#     cnt_words =0; # num of words with a valid vector in the sentence/review
#     for word in sent: # for each word in a review/sentence
#         if word in w2v_words:
#             vec = w2v_model.wv[word]
#             sent_vec += vec
#             cnt_words += 1
#     if cnt_words != 0:
#         sent_vec /= cnt_words
#     sent_vectors.append(sent_vec)

# exc_train = sparse.csr_matrix(sent_vectors).toarray()
# print(type(exc_train), exc_train.shape)

In [None]:
# # Converting exc_test from text to vectors
# sent_vectors = []
# for sent in tqdm(list_of_exc_test):
#     sent_vec = np.zeros(300) # length must match the vector_size (300) the Word2Vec model above was trained with
#     cnt_words =0; # num of words with a valid vector in the sentence/review
#     for word in sent: # for each word in a review/sentence
#         if word in w2v_words:
#             vec = w2v_model.wv[word]
#             sent_vec += vec
#             cnt_words += 1
#     if cnt_words != 0:
#         sent_vec /= cnt_words
#     sent_vectors.append(sent_vec)

# exc_test = sparse.csr_matrix(sent_vectors).toarray()
# print(type(exc_test), exc_test.shape)

# TF-IDF Weighted W2V

In [None]:
# model = TfidfVectorizer()
# tf_idf_train_matrix = model.fit_transform(exc_train)
# tf_idf_test_matrix = model.transform(exc_test)

# # We are converting a dictionary with word as a key, and the idf as a value
# dictionary = dict(zip(model.get_feature_names(), list(model.idf_)))
# tfidf_feat = model.get_feature_names()

In [None]:
# tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
# row=0;
# for sent in tqdm(list_of_exc_train): # for each review/sentence 
#     sent_vec = np.zeros(300) # as word vectors are of zero length
#     weight_sum =0; # num of words with a valid vector in the sentence/review
#     for word in sent: # for each word in a review/sentence
#         if word in w2v_words and word in tfidf_feat:
#             vec = w2v_model.wv[word]
#             tf_idf = tf_idf_train_matrix[row, tfidf_feat.index(word)]
#             # To reduce the computation, we can use the following
#             # dictionary[word] = idf value of word in the whole corpus
#             # sent.count(word) = tf value of word in this review
#             # tf_idf = dictionary[word]*(sent.count(word)/len(sent))
#             sent_vec += (vec * tf_idf)
#             weight_sum += tf_idf
#     if weight_sum != 0: sent_vec /= weight_sum
#     tfidf_sent_vectors.append(sent_vec)
#     row += 1

# exc_train = sparse.csr_matrix(tfidf_sent_vectors).toarray()
# print(type(exc_train), exc_train.shape)

In [None]:
# tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
# row=0;
# for sent in tqdm(list_of_exc_test): # for each review/sentence 
#     sent_vec = np.zeros(300) # as word vectors are of zero length
#     weight_sum =0; # num of words with a valid vector in the sentence/review
#     for word in sent: # for each word in a review/sentence
#         if word in w2v_words and word in tfidf_feat:
#             vec = w2v_model.wv[word]
#             tf_idf = tf_idf_test_matrix[row, tfidf_feat.index(word)]
#             # To reduce the computation, we can use the following
#             # dictionary[word] = idf value of word in the whole corpus
#             # sent.count(word) = tf value of word in this review
#             # tf_idf = dictionary[word]*(sent.count(word)/len(sent))
#             sent_vec += (vec * tf_idf)
#             weight_sum += tf_idf
#     if weight_sum != 0: sent_vec /= weight_sum
#     tfidf_sent_vectors.append(sent_vec)
#     row += 1

# exc_test = sparse.csr_matrix(tfidf_sent_vectors).toarray()
# print(type(exc_test), exc_test.shape)

# Applying Linear Regression

In [None]:
# # Binary Bag of Words = 0.72
# lr = LinearRegression(normalize=True)
# lr.fit(exc_train, y_train)
# y_pred = lr.predict(exc_test)
# error = np.sqrt(mean_squared_error(y_test, y_pred))
# print(error)

In [None]:
# # Binary Bag of Words = 1.462
# exc_train_dense = exc_train.todense()
# exc_test_dense = exc_test.todense()
# lr = LinearRegression(normalize=True, fit_intercept=False, positive=True)
# lr.fit(exc_train_dense, y_train)
# y_pred = lr.predict(exc_test_dense)
# error = np.sqrt(mean_squared_error(y_test, y_pred))
# print(error)

# Applying AdaBoost Regressor

In [None]:
# # Binary Bag of Words = 0.94
# abr = AdaBoostRegressor(n_estimators=100, learning_rate=0.025, loss='square')
# abr.fit(exc_train, y_train)
# y_pred = abr.predict(exc_test)
# error = np.sqrt(mean_squared_error(y_test, y_pred))
# print(error)

# Applying Bagging Regressor

In [None]:
# # Binary Bag of Words = 0.873
# from sklearn.ensemble import BaggingRegressor
# br = BaggingRegressor()
# br.fit(exc_train, y_train)
# y_pred = br.predict(exc_test)
# error = np.sqrt(mean_squared_error(y_test, y_pred))
# print(error)

# Applying Extra Trees Regressor

In [None]:
# # Binary Bag of Words = 1.1583
# from sklearn.ensemble import ExtraTreesRegressor
# etr = ExtraTreesRegressor()
# etr.fit(exc_train, y_train)
# y_pred = etr.predict(exc_test)
# error = np.sqrt(mean_squared_error(y_test, y_pred))
# print(error)

# Applying Gradient Boosting Regressor

In [None]:
# # Binary Bag of Words = 0.806
# from sklearn.ensemble import GradientBoostingRegressor
# gbr = GradientBoostingRegressor()
# gbr.fit(exc_train, y_train)
# y_pred = gbr.predict(exc_test)
# error = np.sqrt(mean_squared_error(y_test, y_pred))
# print(error)

# Applying Random Forest Regressor

In [None]:
# # Binary Bag of Words = 0.813
# from sklearn.ensemble import RandomForestRegressor
# rfr = RandomForestRegressor()
# rfr.fit(exc_train, y_train)
# y_pred = rfr.predict(exc_test)
# error = np.sqrt(mean_squared_error(y_test, y_pred))
# print(error)

# Applying Histogram Gradient Boosting Regressor

In [None]:
# # Binary Bag of Words = 0.785
# from sklearn.experimental import enable_hist_gradient_boosting
# from sklearn.ensemble import HistGradientBoostingRegressor
# exc_train_dense = exc_train.todense()
# exc_test_dense = exc_test.todense()
# hgbr = HistGradientBoostingRegressor()
# hgbr.fit(exc_train_dense, y_train)
# y_pred = hgbr.predict(exc_test_dense)
# error = np.sqrt(mean_squared_error(y_test, y_pred))
# print(error)

# Applying RandomizedSearchCV

In [None]:
# parameters = {
#     'n_estimators': [25, 50, 75, 100],
#     'learning_rate': [0.001, 0.01, 0.1, 1, 5],
#     'loss': ['linear', 'square', 'exponential']
# }
# abr = AdaBoostRegressor()
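# sco = make_scorer(mean_squared_error, greater_is_better=False)  # MSE is a loss: without greater_is_better=False the search would select the parameters with the highest error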
# reg = RandomizedSearchCV(abr, parameters, scoring = sco)
# reg.fit(exc_train, y_train)
# print(reg.best_estimator_)

# Applying Lazy Predict

In [None]:
# pip install lazypredict

In [None]:
# from lazypredict.Supervised import LazyRegressor

# reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
# models, predictions = reg.fit(exc_train, exc_test, y_train, y_test)
# print(models)