The block below sets up the project: it imports the required libraries, makes the readability package importable, and applies miscellaneous settings.

In [None]:
#import project libraries
import string
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import StandardScaler
import spacy
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import time

#import readability
#thanks to Ravi Shah for the tips
import sys
sys.path = [
    '../input/readability-package',
] + sys.path
import readability

#suppress the SettingWithCopy warning
pd.set_option("mode.chained_assignment", None)

#seed numpy's global random number generator
np.random.seed(31415)

The raw data files are loaded and sliced down to the relevant columns: the excerpt text (`X`) and the target score (`y`).

In [None]:
#load the competition train and test files
raw_train = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
raw_test = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

#keep the excerpt text as X and the target as y, each as a one-column frame
X = raw_train["excerpt"].to_frame()
y = raw_train["target"].to_frame()

Create a function that binds feature columns (readability measures, spaCy word vectors, and part-of-speech tag ratios) to each excerpt.

In [None]:
def _readability_features(df):
    """Join the readability-package measures onto df as ratio features.

    Word-level counts are divided by the excerpt's word count and
    sentence-beginning counts by its sentence count; the raw size columns
    are dropped afterwards. Returns a new frame; df itself is not modified.
    """
    #thanks to Ravi Shah for the tips
    #one record of grouped measures per excerpt
    read_meas = pd.DataFrame([readability.getmeasures(e, lang = "en") for e in df["excerpt"]])

    #bind readability features to df
    r_list = ["readability grades", "sentence info", "word usage", "sentence beginnings"]
    for item in r_list:
        group = pd.json_normalize(read_meas[item].tolist())
        #json_normalize numbers its rows 0..n-1; reuse df's index so the join
        #pairs rows by position even when df does not have a default index
        group.index = df.index
        #columns already present in df (e.g. "pronoun") get the "_tot" suffix
        df = df.join(group, lsuffix = "_tot")

    #create word type ratios from nominals
    w_list = ["wordtypes", "long_words", "complex_words", "complex_words_dc", "tobeverb", "auxverb", "conjunction_tot", "pronoun_tot", "preposition_tot", "nominalization"]
    df[w_list] = df[w_list].div(df.words, axis = 0)

    #create per sentence ratios from nominals
    b_list = ["pronoun", "interrogative", "article", "subordination", "conjunction", "preposition"]
    df[b_list] = df[b_list].div(df.sentences, axis = 0)

    return df.drop(["words", "characters", "syllables", "wordtypes", "sentences", "paragraphs"], axis = 1)


def _spacy_features(df):
    """Return one spaCy document vector per excerpt as spacy_0..spacy_k columns.

    Each excerpt is lowercased, stripped of punctuation, tokenized and
    filtered against the NLTK and spaCy stop word lists before being
    vectorized. The result is indexed like df.
    """
    #set lang
    nlp = spacy.load('en_core_web_lg')
    tokenizer = nlp.tokenizer

    #built once instead of once per excerpt
    strip_punct = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english')).union(nlp.Defaults.stop_words)

    #first step: lowercase, remove punctuation, tokenize, remove stop words
    cleaned = []
    for e in df["excerpt"]:
        tokens = tokenizer(e.lower().translate(strip_punct))
        cleaned.append(" ".join(str(token) for token in tokens if str(token) not in stop_words))

    #second step: vectorize
    #NOTE(review): disable_pipes() is called without pipe names, so presumably
    #nothing is actually disabled here - confirm against the spaCy version in use
    with nlp.disable_pipes():
        vectors = np.array([nlp(text).vector for text in cleaned])

    #name the columns after the vector width instead of assuming 300
    names = [f"spacy_{i}" for i in range(vectors.shape[1])]

    return pd.DataFrame(vectors, columns = names, index = df.index)


def _pos_features(df):
    """Return, per excerpt, the share of each listed part-of-speech tag.

    Tags that are not in the list are ignored, so each row's ratios are
    relative to the number of listed tags found in that excerpt. A row in
    which no listed tag occurs gets 0 for every ratio instead of NaN.
    The result is indexed like df.
    """
    #parts of speech tag list
    #NOTE(review): VBN, VBP, PRP$ and WP$ are not in this list - confirm that
    #leaving them out is intentional; the list is kept as-is so the feature
    #columns do not change
    pos_tags = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH", "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"]

    #collect one dict of counts per excerpt and build the frame once
    #(growing a DataFrame row by row is quadratic and DataFrame.append is gone in pandas 2)
    rows = []
    for e in df["excerpt"]:
        counts = dict.fromkeys(pos_tags, 0)
        for _, tag in pos_tag(word_tokenize(e)):
            #explicit membership test instead of a bare except
            if tag in counts:
                counts[tag] += 1
        rows.append(counts)

    pos = pd.DataFrame(rows, columns = pos_tags, index = df.index)

    #create ratio of tags; a zero total is masked so that 0/0 does not leak NaN
    totals = pos.sum(axis = 1)
    return pos.div(totals.where(totals > 0), axis = 0).fillna(0.0)


def add_read_meas(df):
    """Bind readability, spaCy vector and part-of-speech features to df.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "excerpt" column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of df followed by the readability
        ratios, the spacy_* vector columns and the part-of-speech tag ratios.
        Rows keep df's index; df itself is not modified.

    The run time of each of the three stages is printed.
    """
    t1 = time.time()
    df = _readability_features(df)
    t2 = time.time()

    print(f"readability measures: {t2 - t1}")

    ###################

    t3 = time.time()
    df = df.join(_spacy_features(df))
    t4 = time.time()

    print(f"spacy measures: {t4 - t3}")

    ###################

    t5 = time.time()
    df = df.join(_pos_features(df))
    t6 = time.time()

    print(f"pos measures: {t6 - t5}")

    return df

Bind the features to the training data, preview them, and standardize them.

In [None]:
#build the feature matrix from the raw excerpts rather than from X itself:
#X ends this cell as a scaled array, so deriving the features from X would
#make the cell fail when it is run a second time
train_features = add_read_meas(raw_train[["excerpt"]])

#drop excerpt
train_features = train_features.drop("excerpt", axis = 1)

#show data
print(train_features.head())

#show data shape
print(train_features.shape)

#scale data (sc is reused later to transform the test features)
sc = StandardScaler().fit(train_features)
X = sc.transform(train_features)

Create the elastic net model function.

In [None]:
#create elastic net CV function
def ENCV(X, y, folds=10):
    """Fit an elastic net whose penalties are chosen by repeated k-fold CV.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Target values; a single-column frame/array is flattened before fitting.
    folds : int, default 10
        Number of splits for each of the 3 k-fold repeats.

    Returns
    -------
    (fitted_model, rmse)
        The fitted ElasticNetCV and the RMSE of its predictions on the same
        X it was fitted on. This is an in-sample (training) error, not a
        cross-validated one.

    The run time is printed.
    """
    t1 = time.time()

    #cv arg
    cv = RepeatedKFold(n_splits = folds, n_repeats = 3, random_state = 684)

    #define possible penalty values
    l_ones = np.arange(.01, 1, .01)

    #instantiate model
    #"normalize" and "alphas" are left at their defaults: normalize = False was
    #removed from scikit-learn and alphas = None is deprecated, while the
    #defaults give the same behavior on every version
    #NOTE(review): tol = .1 is far looser than scikit-learn's default (1e-4),
    #presumably to keep the run time down - confirm
    model = ElasticNetCV(l1_ratio = l_ones,
                        cv = cv,
                        n_jobs = -1,
                        fit_intercept = True,
                        tol = .1)

    #flatten a single-column target so the model gets the 1-d y it expects
    y_true = np.asarray(y)
    if y_true.ndim == 2 and y_true.shape[1] == 1:
        y_true = y_true.ravel()

    #fit model
    fitted_model = model.fit(X, y_true)

    #get predictions, calculate RMSE
    #sqrt of the MSE instead of squared = False, which newer scikit-learn drops
    pred = fitted_model.predict(X)
    rmse = np.sqrt(mean_squared_error(y_true, pred))

    t2 = time.time()

    print(f"elastic net run time: {t2 - t1}")

    #return model and RMSE
    return fitted_model, rmse

The model is run, and the fitted model and its RMSE are saved.

In [None]:
#fit the elastic net on the scaled training features, keeping the model and its RMSE
model, score = ENCV(X, y)

#report the selected alpha and l1 ratio, then the RMSE
chosen_params = (model.alpha_, model.l1_ratio_)
print(*chosen_params)
print(score)

The RMSE (computed on the training data itself, so it is an optimistic estimate) is not great, but I am proud of it.


The test data is run through the same feature pipeline and scaler, and the submission file is written below.

In [None]:
#prepare test features with the same feature function and the scaler fitted on the training data
test = add_read_meas(raw_test["excerpt"].to_frame()).drop(columns = "excerpt")
test = sc.transform(test)

#build submission: ids side by side with their predicted targets
headers = ["id", "target"]
predicted = pd.Series(model.predict(test))
data = [raw_test["id"], predicted]
submission = pd.concat(data, axis = 1, keys = headers)

#show submission
print(submission)

#save submission
submission.to_csv('submission.csv', index = False)