# Data Extraction
Load the training and test datasets.

In [None]:
import pandas as pd
import numpy as np
import os
import sklearn

import sys
sys.path = [
    '../input/readability-package',
] + sys.path
import readability    

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag, pos_tag_sents

from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, BayesianRidge
import spacy
import pickle
import joblib

import warnings
warnings.filterwarnings('ignore')

# Log the interpreter and library versions so a run can be reproduced later.
print("Python Version: ", sys.version)
print("Spacy Version: ", spacy.__version__)
print("SkLearn Version: ", sklearn.__version__)
print("NLTK Version: ", nltk.__version__)
print("Pandas Version: ", pd.__version__)
print("Numpy Version: ", np.__version__)


In [None]:
# Load the competition's train and test CSV files from the Kaggle input folder.
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

# Data Preparation
Clean the data and get them ready for model training.

In [None]:
# source: https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline/notebook

def readability_measurements(passage: str):
    """Compute readability-library features for one passage.

    Calls ``readability.getmeasures(passage, lang='en')`` and flattens a fixed
    selection of its sections into one list, in this order:

    * ``sentence info``: characters_per_word, syll_per_word,
      words_per_sentence
    * ``readability grades``: Kincaid, ARI, Coleman-Liau, FleschReadingEase,
      GunningFogIndex, LIX, SMOGIndex, RIX, DaleChallIndex
    * ``word usage``: tobeverb, auxverb, conjunction, pronoun, preposition,
      nominalization
    * ``sentence beginnings``: pronoun, interrogative, article,
      subordination, conjunction, preposition

    Args:
        passage: The text to measure.

    Returns:
        A list of 24 values in the order given above.

    Raises:
        KeyError: If ``getmeasures`` does not return one of the expected
            sections or keys.
    """
    results = readability.getmeasures(passage, lang='en')

    # (section, keys) pairs; the order here defines the order of the output.
    selection = (
        ('sentence info',
         ('characters_per_word', 'syll_per_word', 'words_per_sentence')),
        ('readability grades',
         ('Kincaid', 'ARI', 'Coleman-Liau', 'FleschReadingEase',
          'GunningFogIndex', 'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex')),
        ('word usage',
         ('tobeverb', 'auxverb', 'conjunction', 'pronoun', 'preposition',
          'nominalization')),
        ('sentence beginnings',
         ('pronoun', 'interrogative', 'article', 'subordination',
          'conjunction', 'preposition')),
    )

    measurements = []
    for section, keys in selection:
        section_values = results[section]
        measurements.extend(section_values[key] for key in keys)
    return measurements

In [None]:
# source: https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline/notebook

def spacy_features(df: pd.DataFrame):
    """Return one spaCy ``en_core_web_lg`` document vector per excerpt.

    Useful resources:
    https://www.kaggle.com/konradb/linear-baseline-with-cv
    https://www.kaggle.com/anaverageengineer/comlrp-baseline-for-complete-beginners

    Args:
        df: Frame with an ``excerpt`` column of texts.

    Returns:
        A NumPy array holding ``nlp(text).vector`` for every excerpt, in the
        row order of ``df.excerpt``.
    """
    nlp = spacy.load('en_core_web_lg')
    # Calling disable_pipes() without names disables nothing, so every
    # excerpt went through the full pipeline. Only the document vector is
    # needed, so switch all components off while vectorising.
    # NOTE(review): for a model with static word vectors such as
    # en_core_web_lg, Doc.vector is the average of the token vectors and
    # should not depend on the disabled components - confirm on the
    # installed spaCy version.
    with nlp.disable_pipes(*nlp.pipe_names):
        vectors = np.array([nlp(text).vector for text in df.excerpt])

    return vectors

def get_spacy_col_names():
    """Return the 300 column labels ``spacy_0`` ... ``spacy_299``."""
    return [f"spacy_{dim}" for dim in range(300)]

In [None]:
# source: https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline/notebook

def pos_tag_features(passage: str):
    """
    Count how often each tracked part-of-speech tag occurs in an excerpt.

    The passage is tokenised with ``word_tokenize`` and tagged with
    ``pos_tag``; the result is one count per tracked tag, in the order of
    the tag list below.
    """
    tracked_tags = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
                    "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
                    "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"]

    # Keep only the tag of every (token, tag) pair.
    seen_tags = [pair[1] for pair in pos_tag(word_tokenize(passage))]

    return [seen_tags.count(label) for label in tracked_tags]

In [None]:
# source: https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline/notebook

def generate_other_features(passage: str):
    """Compute miscellaneous (experimental) surface statistics of a passage.

    Words are the pieces obtained by splitting on a single space character,
    so line breaks do not separate words and consecutive spaces yield empty
    words.

    Args:
        passage: The text to analyse.

    Returns:
        A list with, in order: the counts of ``.``, ``,``, ``;``, ``!`` and
        ``?``; the number of characters; the number of words; the number of
        distinct words; distinct words divided by words; the length of the
        longest word; the mean word length.
    """
    # punctuation count
    periods, commas, semis, exclaims, questions = (
        passage.count(mark) for mark in ".,;!?"
    )

    # Split once and reuse the result for every word statistic.
    words = passage.split(" ")

    num_char = len(passage)
    num_words = len(words)
    unique_words = len(set(words))
    word_diversity = unique_words / num_words

    word_len = [len(w) for w in words]
    longest_word = np.max(word_len)
    avg_len_word = np.mean(word_len)

    return [periods, commas, semis, exclaims, questions,
            num_char, num_words, unique_words, word_diversity,
            longest_word, avg_len_word]

In [None]:
def create_folds(data, num_splits, random_state=None):
    """Shuffle ``data`` and assign every row to a stratified fold.

    The continuous ``target`` column is cut into equal-width bins (the bin
    count follows Sturges' rule) and ``StratifiedKFold`` stratifies on those
    bins instead of on the raw target.

    Args:
        data: Frame with a ``target`` column. It is not modified.
        num_splits: Number of folds.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle, and therefore the fold assignment, can be reproduced.

    Returns:
        A shuffled copy of ``data`` with a fresh 0..n-1 index and an extra
        ``kfold`` column giving the fold in which each row is held out.
    """
    # Shuffle first: sample() returns a new frame, so adding the column
    # below no longer writes a "kfold" column into the caller's DataFrame.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data["kfold"] = -1

    # calculate number of bins by Sturge's rule (floored)
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Bin the targets; kept as a standalone Series so no temporary column
    # has to be added to and dropped from the frame.
    bins = pd.cut(data["target"], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=num_splits)

    # note that, instead of targets, we stratify on the bins!
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[valid_idx, "kfold"] = fold

    return data

# Feature Check
Build the full feature set for each excerpt: readability statistics, spaCy vectors, part-of-speech counts and miscellaneous text statistics.

In [None]:
'''
Source for feature check section: 
1.) https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline/notebook
'''

class CLRDataset:
    """
    CommonLit Readability dataset wrapper.

    Building an instance runs all feature extractors on the ``excerpt``
    column; ``get_df`` then returns the feature-engineered dataframe. With
    ``train=True`` the frame is additionally passed through ``create_folds``
    (shuffled, re-indexed and given a ``kfold`` column).
    """
    def __init__(self, df: pd.DataFrame, train: bool, n_folds=2):
        """
        Args:
            df: Frame with an ``excerpt`` column (and ``target`` when
                ``train`` is true, as required by ``create_folds``).
            train: Whether to assign cross-validation folds.
            n_folds: Number of folds passed to ``create_folds``.
        """
        self.df = df
        self.excerpts = df["excerpt"]

        self._extract_features()

        if train:
            self.df = create_folds(self.df, n_folds)

    def _feature_frame(self, rows, columns):
        """Wrap extracted feature rows in a frame that shares ``self.df``'s index.

        The rows are produced in the positional order of ``self.df``. Giving
        the feature frame the same index labels is what makes the index-based
        merge line each excerpt up with its own features; a default 0..n-1
        index would silently drop or misalign rows whenever ``self.df`` has
        gaps in its index.
        """
        return pd.DataFrame(rows, columns=columns, index=self.df.index)

    def _extract_features(self):
        """Append readability, spaCy, POS-tag and miscellaneous feature columns."""
        # NOTE(review): the merges below assume unique index labels.
        scores_df = self._feature_frame(
            self.df["excerpt"].apply(readability_measurements).tolist(),
            ["chars_per_word", "syll_per_word", "words_per_sent",
             "kincaid", "ari", "coleman_liau", "flesch", "gunning_fog", "lix", "smog", "rix", "dale_chall",
             "tobeverb", "auxverb", "conjunction", "pronoun", "preposition", "nominalization",
             "pronoun_b", "interrogative", "article", "subordination", "conjunction_b", "preposition_b"])
        self.df = pd.merge(self.df, scores_df, left_index=True, right_index=True)

        spacy_df = self._feature_frame(spacy_features(self.df), get_spacy_col_names())
        self.df = pd.merge(self.df, spacy_df, left_index=True, right_index=True)

        pos_df = self._feature_frame(
            self.df["excerpt"].apply(pos_tag_features).tolist(),
            ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
             "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
             "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"])
        self.df = pd.merge(self.df, pos_df, left_index=True, right_index=True)

        other_df = self._feature_frame(
            self.df["excerpt"].apply(generate_other_features).tolist(),
            ["periods", "commas", "semis", "exclaims", "questions",
             "num_char", "num_words", "unique_words", "word_diversity",
             "longest_word", "avg_len_word"])
        self.df = pd.merge(self.df, other_df, left_index=True, right_index=True)

    def get_df(self):
        """Return the feature-engineered dataframe."""
        return self.df

    def __len__(self):
        """Return the number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self, idx: int):
        # Placeholder: item access is not implemented and returns None.
        pass

In [None]:
# Results for training dataset.
# train=True: the features are extracted and a `kfold` column is assigned
# (CLRDataset's default of 2 folds is used).
dataset = CLRDataset(train_df, train=True)
df = dataset.get_df()
df.head()

In [None]:
# Results for testing dataset.
# train=False: features only, no fold assignment. `test_df` is rebound to
# the feature-engineered frame from here on.
test_dataset = CLRDataset(test_df, train=False)
test_df = test_dataset.get_df()
test_df.head(2)

# Modelling
Make the model and train it.

In [None]:
def set_seed(seed=42):
    """Export ``PYTHONHASHSEED`` and seed NumPy's global random generator.

    NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    the environment variable set here should only matter for child
    processes - confirm this is intended.
    """
    os.environ["PYTHONHASHSEED"] = f"{seed}"
    np.random.seed(seed)


set_seed(42)

In [None]:
# Columns fed to the scaler and the model: readability measures, the 300
# spaCy vector dimensions and the POS-tag counts.
# The punctuation / word-count columns produced by generate_other_features
# are not listed here, so they are neither scaled nor used for training.
features = ["chars_per_word", "syll_per_word", "words_per_sent",
            "kincaid", "ari", "coleman_liau", "flesch", "gunning_fog", "lix", "smog", "rix", "dale_chall",
            "tobeverb", "auxverb", "conjunction", "pronoun", "preposition", "nominalization", 
            "pronoun_b", "interrogative", "article", "subordination", "conjunction_b", "preposition_b"]
features+=get_spacy_col_names()
features+=["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", 
            "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
            "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"]

In [None]:
""" I normalize the data here, could be useful depending on your model"""
# Fit the min-max scaling on the training features only, then reuse the
# fitted scaler on the test features so both are scaled with the same ranges.
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])
test_df[features] = scaler.transform(test_df[features])

# Training
Define the model to train.

In [None]:
def train_pred_one_fold(model_name: str, fold: int, df: pd.DataFrame, test_df: pd.DataFrame, features: list, rmse: list):
    """
    Train the selected model for one fold and predict the test set.

    Rows whose ``kfold`` equals ``fold`` are held out for validation and the
    model is fitted on all remaining rows. The estimator itself is read from
    the module-level name ``model``, which ``train_pred`` sets before calling
    this function.

    Args:
        model_name: ``'ridge'`` or ``'bayesian_ridge'``.
        fold: Value of ``df.kfold`` to hold out.
        df: Training frame with ``kfold``, ``target`` and the feature columns.
        test_df: Frame to predict, containing the feature columns.
        features: Names of the X columns.
        rmse: List to which this fold's validation RMSE is appended.

    Returns:
        The model's predictions for ``test_df[features]``.

    Raises:
        NotImplementedError: For any other ``model_name``. It is a subclass
            of ``Exception``, so existing ``except Exception`` handlers
            still catch it.
    """
    # Reject unsupported models before doing any work.
    if model_name not in ('ridge', 'bayesian_ridge'):
        raise NotImplementedError("Not Implemented")

    # The comparison used to be inverted: the model was fitted on the single
    # fold and scored on all the others, which is only harmless with 2 folds.
    held_out = df.kfold == fold
    train = df[~held_out]
    valid = df[held_out]

    X_train = train[features]
    y_train = train["target"]
    X_valid = valid[features]
    y_valid = valid["target"]
    X_test = test_df[features]

    model.fit(X_train, y_train)
    oof = model.predict(X_valid)

    fold_rmse = np.sqrt(mean_squared_error(y_valid, oof))
    print(fold_rmse)
    rmse.append(fold_rmse)

    test_preds = model.predict(X_test)

    # An existing model file is kept rather than overwritten, so a file
    # left over from an earlier run may not match the model trained here.
    if not os.path.isfile(f"model_{fold}.joblib"):
        joblib.dump(model, f"model_{fold}.joblib")

    return test_preds

In [None]:
def train_pred(model_name: str, df: pd.DataFrame, test_df: pd.DataFrame, features: list):
    """
    Train and predict over every fold using train_pred_one_fold.

    The per-fold validation RMSE values and their average are printed.

    Args:
        model_name: ``'ridge'`` or ``'bayesian_ridge'``.
        df: Training frame with ``kfold``, ``target`` and the feature columns.
        test_df: Frame to predict, containing the feature columns.
        features: Names of the X columns.

    Returns:
        A DataFrame with one ``{model_name}_{fold}`` column of test
        predictions per fold and a final ``{model_name}`` column holding the
        row-wise mean of those columns (the value to submit).
    """
    # train_pred_one_fold reads the estimator from this module-level name.
    global model
    if model_name == 'ridge':
        model = Ridge(alpha=3, max_iter=10000)

    elif model_name == 'bayesian_ridge':
        # NOTE(review): newer scikit-learn releases renamed `n_iter` to
        # `max_iter` - confirm against the installed version.
        model = BayesianRidge(n_iter=10000, tol=0.8)

    print(f"model_name: {model_name}")
    all_preds = pd.DataFrame()
    rmse = list()
    # Iterate over the folds actually present instead of a hard-coded 2, so
    # a dataset built with a different n_folds is covered completely.
    for f in sorted(df["kfold"].unique()):
        all_preds[f"{model_name}_{f}"] = train_pred_one_fold(model_name, f, df, test_df, features, rmse)

    all_preds[f"{model_name}"] = all_preds.mean(axis=1)
    print("---------")
    print(f"avg rmse: {np.mean(rmse)}")
    return all_preds

In [None]:
def prep_sub(preds: pd.DataFrame, col_name: str):
    """
    Write a competition submission file from a train_pred result.

    Reads the sample submission, replaces its ``target`` column with
    ``preds[col_name]``, saves the result as ``submission.csv`` (without the
    index) and prints it.
    """
    template = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
    submission = template.assign(target=preds[col_name])
    submission.to_csv("submission.csv", index=False)
    print(submission)



In [None]:
# Train on every fold and predict the competition test set.
# Despite its name, `ridge_preds` holds the BayesianRidge predictions.
ridge_preds = train_pred('bayesian_ridge', df, test_df, features)
ridge_preds

In [None]:
# Submit the fold-averaged column (named after the model).
prep_sub(ridge_preds, 'bayesian_ridge')


# Data Scraping
Get at least the top 100 results from a book site called Project Gutenberg.

Since the site discourages scraping, I take the cheeky route and only fetch this one listing page: https://www.gutenberg.org/browse/scores/top#books-last1

Sorry MSPs but I need to enable internet for this one.

In [None]:
# For web scraping, charrrrgeeeee !!!!!

import requests
from bs4 import BeautifulSoup as bs # Totally not BS
import re
import pandas as pd
import numpy as np
import itertools

In [None]:
the_only_url_worth_scraping = "https://www.gutenberg.org/browse/scores/top#books-last1"
# Without a timeout, requests can wait indefinitely on an unresponsive server.
response = requests.get(the_only_url_worth_scraping, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as the ranking.
response.raise_for_status()
book_soup_is_delicious = bs(response.text, 'html.parser')

In [None]:
# Extract book urls from the first ordered list on the page.
book_anchors = book_soup_is_delicious.ol.find_all('a', attrs={'class': None})
# .get() tolerates anchors without an href attribute.
book_hrefs = [tag.get('href', '') for tag in book_anchors]
books_list = [href for href in book_hrefs if href.startswith('/ebooks/')]

# Drop duplicate links while keeping the ranking order. The result used to
# be stored only under the misspelled name `books_lists`, so the scraping
# loop still iterated over the list with duplicates.
books_list = list(dict.fromkeys(books_list))
books_lists = books_list  # old (misspelled) name kept as an alias

In [None]:
# Make the test dataset: one HTTP request per book, so this takes a while.
EXCERPT_COLUMNS = ['id', 'url_legal', 'license', 'excerpt']
TITLE_PREFIX = "the project gutenberg ebook of "
MAX_EXCERPT_WORDS = 175    # an excerpt is cut off at this many words
MIN_EXCERPT_WORDS = 75     # shorter excerpts are not added to the dataset
MIN_PARAGRAPH_CHARS = 50   # paragraphs must be longer than this to be used
# Paragraph openings that mark headings, credits or licensing text.
BOILERPLATE_PREFIXES = ('chapter', 'drawn by', 'copyright', 'gnu free',
                        'note:', 'produced by:')


def _mirror_url(book_id):
    """Build the HTML URL of a book on the gutenberg.pglaf.org mirror.

    The main site does not like being scraped, so the mirror is used. Its
    path consists of every digit of the id except the last (or "0" for a
    one-digit id), followed by the id itself.
    """
    url = "https://gutenberg.pglaf.org/"
    if len(book_id) == 1:
        url += "0/"
    else:
        url += "".join(digit + "/" for digit in book_id[:-1])
    return url + book_id + "/" + book_id + "-h/" + book_id + "-h.htm"


def _is_boilerplate(text):
    """Tell whether a paragraph looks like a heading, credit or license text."""
    head = text[:12]
    return (
        head.lower().startswith(BOILERPLATE_PREFIXES)
        # No lower-case letter in the first 12 characters: all-caps heading.
        or head.upper().startswith(head)
        or "Project Gutenberg eBook" in text
    )


def _build_excerpt(paragraph_tags):
    """Concatenate usable paragraphs until MAX_EXCERPT_WORDS words are reached."""
    excerpt = ""
    for tag in paragraph_tags:
        text = tag.get_text().strip()
        # Ignore empty and short paragraphs as well as chapter listings etc.
        if len(text) <= MIN_PARAGRAPH_CHARS or _is_boilerplate(text):
            continue

        # Collapse every run of whitespace into one space. The previous
        # replace('  ', '') deleted double spaces and glued words together.
        excerpt += " ".join(text.split()) + " \n"

        words = excerpt.split(' ')
        if len(words) >= MAX_EXCERPT_WORDS:
            return ' '.join(words[:MAX_EXCERPT_WORDS])
    return excerpt


def _fetch_book(book_id):
    """Download one book; return (url, title, excerpt), or None on failure."""
    url = _mirror_url(book_id)
    try:
        book_response = requests.get(
            url, headers={"Accept": "text/html;charset=utf-8"}, timeout=30)
    except requests.RequestException as err:
        print("Skipping book {}: {}".format(book_id, err))
        return None
    if not book_response.ok:
        print("Skipping book {}: HTTP {}".format(book_id, book_response.status_code))
        return None

    single_book = bs(book_response.text, 'html.parser')

    # A page without a <title> tag gets an empty title instead of crashing.
    title_tag = single_book.title
    title = title_tag.get_text().strip() if title_tag is not None else ""
    if title.lower().startswith(TITLE_PREFIX):
        title = title[len(TITLE_PREFIX):]

    paragraph_tags = single_book.find_all('p', attrs={'class': None})
    return url, title, _build_excerpt(paragraph_tags)


title_dict = {}
excerpt_rows = []

for j, book in enumerate(books_list, start=1):
    book_id = book[len('/ebooks/'):]

    fetched = _fetch_book(book_id)
    if fetched is not None:
        url_html_book, single_book_title, paragraphs = fetched
        title_dict[book_id] = single_book_title

        if len(paragraphs.split(' ')) >= MIN_EXCERPT_WORDS:
            excerpt_rows.append([book_id, url_html_book,
                                 "Public domain in the USA.", paragraphs])
    print("Progress: {} of {}".format(j, len(books_list)))

# Build the frame once (DataFrame.append was removed in pandas 2.0). The
# index is a contiguous 0..n-1 range: the old per-book index had gaps for
# skipped books, which breaks index-based merges of positional feature frames.
csv_data = pd.DataFrame(excerpt_rows, columns=EXCERPT_COLUMNS)

csv_data.to_csv("guthenberg.csv", index=True) # Raw data to be used for processing.

In [None]:
# Results for guthenberg dataset.
# CLRDataset joins its feature frames to the input by index, so hand it a
# clean positional index; gaps in the index can otherwise cost rows.
test_dataset = CLRDataset(csv_data.reset_index(drop=True), train=False)
test_df = test_dataset.get_df()
# fillna returns a new frame, so the result must be assigned back; the model
# only processes numbers, hence NaNs / empty values become 0.
test_df = test_df.fillna(0)
# test_df.to_csv("sample_gutenberg_dataset.csv") 
test_df.head()

In [None]:
# Prepare to use model actually.
# Scale the Gutenberg features with the scaler fitted on the training data,
# then re-train on `df` fold by fold and predict the Gutenberg excerpts.
test_df[features] = scaler.transform(test_df[features])
ridge_preds = train_pred('bayesian_ridge', df, test_df, features)

# Temp save bayesian ridge results.
# ridge_preds.to_csv("model_results.csv", columns = ["bayesian_ridge"], index=False)
ridge_preds

In [None]:
# Pair every book id with its averaged prediction and its title, then save
# the result as id / target / title rows.
ridge_list = ridge_preds["bayesian_ridge"].to_list()
test_ids = test_df["id"].to_list()
guthenb_list = [
    [book_id, score, title_dict[book_id]]
    for book_id, score in zip(test_ids, ridge_list)
]
guthenb = pd.DataFrame(guthenb_list, columns=["id", "target", "title"])
guthenb.to_csv("guthenberg-results.csv", index=False)