Spacy features taken from https://www.kaggle.com/anaverageengineer/comlrp-baseline-for-complete-beginners - please upvote!

Current score (LB: 0.624) is from version 4.

In [None]:
import numpy as np 
import pandas as pd 
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import spacy

In [None]:
# config params
class CFG:
    """Notebook-wide configuration values."""

    # number of KFold splits used for cross-validation
    nfolds = 10
    

# Functions

In [None]:
# count syllables: https://stackoverflow.com/questions/46759492/syllable-count-in-python
def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Every maximal run of vowels (``a e i o u y``) counts as one syllable and a
    trailing "e" is treated as silent.  Matching is case-insensitive.

    Args:
        word: The string to analyse.

    Returns:
        0 for an empty string, otherwise an integer >= 1.
    """
    word = word.lower()
    if not word:
        # nothing to count; also avoids indexing into an empty string
        return 0

    vowels = "aeiouy"
    count = 1 if word[0] in vowels else 0

    # a vowel preceded by a non-vowel starts a new vowel group
    for previous, current in zip(word, word[1:]):
        if current in vowels and previous not in vowels:
            count += 1

    # the silent-"e" correction is applied once per word, after all vowel
    # groups have been counted
    if word.endswith("e"):
        count -= 1

    # every non-empty word is given at least one syllable
    return max(count, 1)

# Data and FE

In [None]:
# read the competition train and test tables, in that order
xtrain, xtest = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/commonlitreadabilityprize/train.csv',
        '/kaggle/input/commonlitreadabilityprize/test.csv',
    )
)

## summary statistics features


In [None]:
# Flesch score: https://blog.ung.edu/press/measure-readability/

def add_summary_features(frame):
    """Add hand-crafted text statistics of ``frame['excerpt']`` as new columns.

    The same computation is applied to the train and the test frame so the two
    cannot drift apart.  Columns are added in place, in this order:
    ``nof_char``, ``nof_words``, ``w2c``, ``nof_sentences``, ``nof_syllables``,
    ``fleisch_score``, ``fleisch_score2``, ``nof_unique_words``,
    ``txt_diversity``, ``longest_word``, ``avg_word``.

    Args:
        frame: DataFrame with a string column named ``excerpt``.

    Returns:
        The same ``frame`` object, for convenience.
    """
    excerpt = frame['excerpt']

    # count the characters
    frame['nof_char'] = excerpt.apply(len)

    # count the words
    # NOTE(review): tokens are split on single spaces only, so words separated
    # by a newline stay fused and repeated spaces yield empty tokens.  Kept
    # as-is so that existing feature values do not change.
    tokens = excerpt.apply(lambda s: s.split(' '))
    frame['nof_words'] = tokens.apply(len)

    # words to characters
    frame['w2c'] = frame['nof_words'] / frame['nof_char']

    # nof sentences (approximated by the number of full stops)
    frame['nof_sentences'] = excerpt.apply(lambda s: s.count('.'))

    # nof syllables (the heuristic is run over the whole excerpt at once)
    frame['nof_syllables'] = excerpt.apply(syllable_count)

    # An excerpt without any '.' has nof_sentences == 0; floor the denominator
    # at 1 so the ratio stays finite instead of becoming inf.
    words_per_sentence = frame['nof_words'] / frame['nof_sentences'].clip(lower=1)
    syllables_per_word = frame['nof_syllables'] / frame['nof_words']

    # Fleisch score (column names keep this spelling because the feature list
    # refers to them by name)
    a = 206.835 - 1.015 * words_per_sentence
    b = -84.6 * syllables_per_word
    frame['fleisch_score'] = a + b

    # Fleisch score 2
    frame['fleisch_score2'] = (
        0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
    )

    # count the unique words
    frame['nof_unique_words'] = tokens.apply(lambda t: len(set(t)))

    # text diversity
    frame['txt_diversity'] = frame['nof_unique_words'] / frame['nof_words']

    # word lengths
    word_lengths = tokens.apply(lambda t: [len(w) for w in t])
    frame['longest_word'] = word_lengths.apply(max)
    frame['avg_word'] = word_lengths.apply(np.mean)

    return frame


add_summary_features(xtrain)
add_summary_features(xtest)


## Spacy features

In [None]:
# Taken from: https://www.kaggle.com/anaverageengineer/comlrp-baseline-for-complete-beginners

nlp = spacy.load('en_core_web_lg')

# `disable_pipes()` called without names disables nothing, so every pipeline
# component would run on every excerpt.  Pass all component names so they are
# switched off while the document vectors are collected.
# NOTE(review): assumes `Doc.vector` for this model is built from the static
# word vectors and therefore does not depend on the disabled components -
# confirm against the installed spaCy version.
with nlp.disable_pipes(*nlp.pipe_names):
    train_vectors = np.array([nlp(text).vector for text in xtrain.excerpt])
    test_vectors = np.array([nlp(text).vector for text in xtest.excerpt])

# one column name per vector dimension: f0, f1, ...
namelist = ['f' + str(ii) for ii in range(train_vectors.shape[1])]

train_vectors = pd.DataFrame(train_vectors, columns=namelist)
test_vectors = pd.DataFrame(test_vectors, columns=namelist)


## Combined

In [None]:
# append the spaCy vector columns to the right of each frame
xtrain = pd.concat((xtrain, train_vectors), axis='columns')
xtest = pd.concat((xtest, test_vectors), axis='columns')

# model inputs: the summary statistics followed by every vector column
features = [
    'nof_words',
    'nof_sentences',
    'nof_syllables',
    'fleisch_score',
    'txt_diversity',
    'nof_unique_words',
    'nof_char',
    'w2c',
    'fleisch_score2',
    *namelist,
]
features

# CV 

In [None]:
# fold splitter shared by all models
kf = KFold(n_splits=CFG.nfolds)

# one-column buffers: per-row validation predictions for the training set and
# accumulated predictions for the test set
prval = np.zeros((len(xtrain), 1))
prfull = np.zeros((len(xtest), 1))

In [None]:
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline

In [None]:
# candidate regressors, keyed by the short label printed with their CV score
models = dict(
    LR=LinearRegression(),
    rdg=Ridge(),
    rf=RandomForestRegressor(n_estimators=100),
    Adb=AdaBoostRegressor(n_estimators=120),
)

In [None]:
# Cross-validate every model and build the test prediction as the plain
# average over all (model, fold) fits.
#
# The test accumulator is shared by all models, so each fit must contribute
# 1 / (nfolds * number of models); dividing by nfolds alone would leave the
# sum of the per-model predictions in `prfull`.
prfull[:] = 0  # start clean so re-running this cell does not stack results
n_fits = CFG.nfolds * len(models)

# select the feature columns once instead of on every fold
train_x = xtrain[features]
test_x = xtest[features]

for m, model in models.items():
    for id0, id1 in kf.split(xtrain):
        # KFold yields positional indices, hence .iloc rather than .loc
        x0, x1 = train_x.iloc[id0], train_x.iloc[id1]
        y0, y1 = xtrain['target'].iloc[id0], xtrain['target'].iloc[id1]
        model.fit(x0, y0)

        # validation predictions of the current model for this fold
        prval[id1, 0] = model.predict(x1)
        prfull[:, 0] += model.predict(test_x) / n_fits

        # fold RMSE
        print(np.round(np.sqrt(mse(y1, prval[id1, 0])), 2))

    # score: RMSE of the current model over all validation predictions
    print('--' + m)
    print(np.round(np.sqrt(mse(xtrain['target'], prval)), 3))
    print('\n')

# Submission

In [None]:
# submission table: the test ids plus the accumulated test predictions
xsub = xtest[["id"]].assign(target=prfull)
xsub.to_csv('submission.csv', index=False)