In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Overview

In this competition, the goal is to develop a model that predicts the reading ease of literature excerpts drawn from a wide range of time periods. The target is a continuous score, so this is a regression task.

# Exploratory Data Analysis

## Imports

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

## train.csv

In [None]:
# Load the training data and keep its dimensions around for later proportions.
df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
RECORDS = df.shape[0]
COLUMNS = df.shape[1]
print(f'Shape: {(RECORDS, COLUMNS)}')

train.csv has 6 columns and 2834 records. Let's see what the columns look like.

### train.csv feature information

In [None]:
df.head()

In [None]:
# Print the pandas dtype of every column in the training frame.
# df.dtypes already maps column name -> dtype, so no per-column lookup
# (or the previously unused `dtypes` dict) is needed.
print('dtypes for each column:')
for column, dtype in df.dtypes.items():
    print(f'{column}: {dtype}')

The 6 features are: id, url_legal, license, excerpt, target, and standard_error. Target and standard_error are floats, and all other features are strings.

In [None]:
# Count missing values per column and express them as a share of all records.
null_count = df.isnull().sum().rename('null_count')
null_proportion_total = (null_count/RECORDS).rename('proportion_total')

# Show both measures side by side, one row per column of df.
pd.concat([null_count, null_proportion_total], axis=1)

There are numerous null values for url_legal and license, but those probably won't be useful in our model, so we don't need to worry about filling them.

In [None]:
# Number of distinct values per column (DataFrame.nunique skips nulls by default).
unique_count = df.nunique().rename('unique_count')
# Distinct values relative to the non-null records of each column ...
proportion_non_null = (unique_count/(RECORDS-null_count)).rename('proportion_non_null')
# ... and relative to all records.
proportion_total = (unique_count/RECORDS).rename('proportion_total')

pd.concat([unique_count, proportion_non_null, proportion_total], axis=1)

Every record has unique values for id (as expected), excerpt, target, and standard_error. Among the non-null records for url_legal and license, there are numerous duplicates, especially for license.

## Target and standard error distributions

In [None]:
df[['target', 'standard_error']].describe()

In [None]:
# Two stacked panels (2 rows x 1 column): one boxen plot each for target and standard_error.
fig, ax = plt.subplots(
    2,
    1,
    figsize=(10, 6),
)
for i, column in enumerate(['target', 'standard_error']):
    plot = sns.boxenplot(
        data=df, 
        x=column,
        ax=ax[i],
        linewidth=1,
        width=.5,
        # A one-colour palette: only the first colour of seaborn's 'deep' palette.
        palette=sns.color_palette('deep')[:1],
    )
    plot.set_xlabel(column, fontsize=14)
    # Clear the y label on every panel after the first.
    if i > 0:
        plot.set(ylabel=None)
fig.suptitle('Distributions of target and standard_error', fontsize=16)
fig.tight_layout()

target ranges from -3.677 to 1.711. standard_error ranges from 0 to 0.650, although it should be noted that the minimum is a significant outlier. Both target and standard_error contain outliers.

In [None]:
# Scatter of target against standard_error on a single square panel.
fig, ax = plt.subplots(
    1,
    1,
    figsize=(6, 6),
)
# No `ax` is passed, so seaborn draws on the current axes (the one just created).
plot = sns.scatterplot(
    data=df, 
    x='target',
    y='standard_error',
    linewidth=0,
    color=sns.color_palette('deep')[0],
    alpha=.5,
)
# Start the y axis at 0.4; points below that (e.g. standard_error == 0) are cut off.
plt.ylim(.4, None)
fig.suptitle('Scatterplot of target v. standard_error', fontsize=16)
fig.tight_layout()

Generally, the more extreme values of target have a larger standard error. The above scatterplot has a y minimum of 0.4 to more clearly show the trend, but doing so removes the clear standard_error outlier at 0.

# Pre-processing Excerpt

## Imports

In [None]:
from collections import Counter
import string
from nltk.tokenize import sent_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

## Special characters

In [None]:
# Characters treated as "ordinary": ASCII lowercase letters, digits, ASCII
# punctuation, plus space, newline and the em/en dashes.
lowercase = list(string.ascii_lowercase)
digits = list(string.digits)
punc = list(string.punctuation)
extra = [' ', '\n', '—', '–']
lowercase_digits_punc = lowercase + digits + punc + extra

# Tally every lower-cased character across all excerpts.
character_counter = Counter()
for excerpt in df['excerpt']:
    character_counter.update(character.lower() for character in excerpt)

# One row per character, most frequent first.
characters_df = pd.DataFrame(character_counter.most_common(), columns=['character', 'count'])

# Everything outside the "ordinary" set counts as a special character.
is_ordinary = characters_df['character'].isin(lowercase_digits_punc)
special_characters_df = characters_df.loc[~is_ordinary].copy()

total_character_count = characters_df['count'].sum()
print(f'Total number of characters: {total_character_count}')

In [None]:
# Bar chart with one bar per special character, using the counts computed above.
fig = plt.figure(figsize=(12,6))
plot = sns.barplot(
    data=special_characters_df,
    x='character',
    y='count',
    color=sns.color_palette('deep')[0],
)
# Re-apply the existing tick labels only to enlarge their font size.
plot.set_xticklabels(plot.get_xticklabels(), size=14)
plot.set_title('Barplot of special character counts', fontsize=16)

We can see that there are instances of the less common characters, but they are relatively rare. There are also special characters, such as accented characters, which likely appear in the middle of words.

## Sentence cleaning function

In [None]:
def clean(sentence, remove_stopwords=True, lemmatize=True):
    """
    Takes a raw sentence and processes it into a list of lowercase tokens,
    optionally without stopwords and optionally lemmatized.
    
    Keyword arguments:
        sentence: the raw sentence as a string
        remove_stopwords: True if stopwords should be removed from the sentence, false otherwise
        lemmatize: True if the sentence tokens should be lemmatized, false otherwise
    
    Returns:
        sentence (list): the cleaned list of tokens in the sentence
    """
    
    # Transform to lowercase and separate tokens.
    # This method is fast, but it keeps only runs of word characters, so
    # punctuation is dropped and words containing apostrophes, hyphens or
    # other special characters may be split strangely.
    # The pattern is a raw string so that "\w" is not treated as a (invalid)
    # string escape sequence.
    tokens = regexp_tokenize(sentence.lower(), pattern=r'\w+')

    # Remove stopwords
    if remove_stopwords:
        # Load the stopword list once per call and use a set for O(1) lookups;
        # previously the corpus list was re-read for every single token.
        english_stopwords = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in english_stopwords]

    # Lemmatize
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

## Example with the first excerpt

Before pre-processing:

In [None]:
print(df['excerpt'][0])

After pre-processing, with sentences separated by '.' and tokens separated by ' ':

In [None]:
# Clean each sentence of the first excerpt, join its tokens with spaces and
# the sentences with ' . ', then show the result.
first_excerpt_sentences = sent_tokenize(df['excerpt'][0])
res = ' . '.join(' '.join(clean(sentence)) for sentence in first_excerpt_sentences)
print(res)

## Cleaning all excerpts

In [None]:
# Split every excerpt into sentences and clean each sentence with the default
# clean() settings, giving one list of token lists per excerpt.
CLEANED_EXCERPTS = df['excerpt'].transform(lambda excerpt: [clean(sentence) for sentence in sent_tokenize(excerpt)])
CLEANED_EXCERPTS

# Feature Engineering

## Imports

In [None]:
from nltk.corpus import cmudict

from nltk import pos_tag as pos_tag

## Excerpt, sentence, word, and syllable metadata

In [None]:
# Empty frame that the following cells fill with one metadata column each.
meta_df = pd.DataFrame()

### Excerpt length

In [None]:
# Total characters over all cleaned tokens of an excerpt.
meta_df['char_count'] = CLEANED_EXCERPTS.transform(
    lambda excerpt: sum(len(word) for sentence in excerpt for word in sentence)
)
# Total number of cleaned tokens of an excerpt.
meta_df['word_count'] = CLEANED_EXCERPTS.transform(
    lambda excerpt: sum(len(sentence) for sentence in excerpt)
)
# Number of sentences of an excerpt.
meta_df['sent_count'] = CLEANED_EXCERPTS.transform(len)

### Mean words per sentence

In [None]:
def mean_words_per_sent(excerpt):
    """
    Computes the average sentence length, in words, of a cleaned excerpt.
    
    Keyword Parameters:
        excerpt: cleaned excerpt (a list of sentences, each a list of words)
        
    Returns:
        mean_len (float): total word count divided by the number of sentences
    """
    total_words = 0
    sentence_total = 0
    for sentence in excerpt:
        total_words += len(sentence)
        sentence_total += 1
    return total_words / float(sentence_total)
# Apply the helper to every cleaned excerpt and store the result as a metadata column.
meta_df['mean_words_per_sent'] = CLEANED_EXCERPTS.transform(mean_words_per_sent)

### Mean characters per word

In [None]:
def mean_chars_per_word(excerpt):
    """
    Computes the average word length, in characters, of a cleaned excerpt.
    
    Keyword Parameters:
        excerpt: cleaned excerpt (a list of sentences, each a list of words)
        
    Returns:
        mean_len (float): total character count divided by the number of words
    """
    total_chars = 0
    word_total = 0
    for sentence in excerpt:
        for word in sentence:
            total_chars += len(word)
            word_total += 1
    return total_chars / float(word_total)
# Apply the helper to every cleaned excerpt and store the result as a metadata column.
meta_df['mean_chars_per_word'] = CLEANED_EXCERPTS.transform(mean_chars_per_word)

### Mean syllables per word

Credit to [Syllapy](https://github.com/mholtzscher/syllapy), Michael Holtzscher for syllabication.

In [None]:
# referred from datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word/24262
# Load the CMU pronouncing dictionary once; nsyl() looks words up in it.
d = cmudict.dict()
def nsyl(word):
    """
    Returns the syllable count of a word.

    Words present in the pronouncing dictionary `d` are counted from their
    first listed pronunciation: every phoneme whose last character is a digit
    counts as one syllable. Any other word falls back to the heuristic in
    syllables().
    """
    if word not in d:
        # if word not found in cmudict
        return syllables(word)
    first_pronunciation = d[word][0]
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())
def syllables(word):
    """
    Estimates the syllable count of a word with a vowel-group heuristic.

    Each run of consecutive vowels (a, e, i, o, u, y) counts once; a trailing
    'e' subtracts one, a trailing 'le' adds one back, and a result of zero is
    bumped to one. The word must be non-empty.
    """
    vowels = 'aeiouy'
    word = word.lower()

    # A vowel group starts at a vowel that is not preceded by another vowel.
    count = 1 if word[0] in vowels else 0
    count += sum(
        1
        for previous, current in zip(word, word[1:])
        if current in vowels and previous not in vowels
    )

    # Adjust for word endings.
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1

    if count == 0:
        return 1
    return count

def mean_syllable_count_per_word(excerpt):
    """
    Takes a cleaned excerpt and returns the mean syllable count per word
    
    Keyword Parameters:
        excerpt: cleaned excerpt (a list of sentences, each a list of words)
        
    Returns:
        mean_len (float): the mean syllable count per word, taken over the
            words for which a positive syllable count was found
    """
    # Count syllables exactly once per word (the lookup was previously done
    # twice: once for the filter and once for the value).
    counts = (nsyl(word) for sentence in excerpt for word in sentence)
    # If 0 syllables are counted for a word, the word is likely jargon and shouldn't be counted
    syllable_counts = [count for count in counts if count > 0]
    return sum(syllable_counts)/float(len(syllable_counts))
# Apply the helper to every cleaned excerpt and store the result as a metadata column.
meta_df['mean_syllables'] = CLEANED_EXCERPTS.transform(mean_syllable_count_per_word)

### Polysyllables count

In [None]:
def polysyllables(excerpt):
    """
    Counts the words of a cleaned excerpt that have three or more syllables.

    Keyword Parameters:
        excerpt: cleaned excerpt (a list of sentences, each a list of words)

    Returns:
        count (int): the number of words for which nsyl() reports >= 3
    """
    return sum(1 for sentence in excerpt for word in sentence if nsyl(word) >= 3)
# Apply the helper to every cleaned excerpt and store the result as a metadata column.
meta_df['polysyllables'] = CLEANED_EXCERPTS.transform(polysyllables)

### Visualizations

In [None]:
meta_df.head()

In [None]:
# One boxen plot per metadata column, laid out side by side in a single row.
fig, ax = plt.subplots(1, len(meta_df.columns), figsize=(16,6))
for i, column in enumerate(meta_df.columns):
    plot = sns.boxenplot(
        data=meta_df,
        y=column,
        ax=ax[i],
        linewidth=1,
        width=.5,
    )
    # The label starts with a newline character.
    plot.set_ylabel(f'\n{column}', fontsize=14)
fig.suptitle('Boxenplots for excerpt metadata', fontsize=12)
fig.tight_layout()
fig.show()

Note that there are outliers across all the features, especially for mean words per sentence, with a few overly long sentences. Also, these features will need to be scaled before being used in any model, considering the variance in their ranges.

In [None]:
# Scatter every metadata column against the target on a 3x3 grid with a shared y axis.
fig, ax = plt.subplots(3, 3, figsize=(12,8), sharey=True)
for i, column in enumerate(meta_df.columns):
    plot = sns.scatterplot(
        data=meta_df,
        x=column,
        y=df['target'],
        # Fill the grid row by row.
        ax=ax[i//3, i%3],
        linewidth=0,
        alpha=0.2,
    )
    plot.set_xlabel(f'{column}', fontsize=14)
    
# Hide the last two panels of the bottom row.
# NOTE(review): this hard-codes the assumption that meta_df has exactly 7 columns.
ax[2,1].axis('off')
ax[2,2].axis('off')
fig.suptitle('Scatterplots for excerpt metadata against target', fontsize=16)
fig.tight_layout()
fig.show()

In [None]:
# Correlate metadata with the target, then keep only the 'target' row and the
# metadata columns, giving a single-row matrix.
corr = pd.concat([meta_df, df['target']], axis=1).corr().drop('target', axis=1).drop(meta_df.columns, axis=0)
fig = plt.figure(figsize=(10,3))
plot = sns.heatmap(
    data=corr,
    # Fix the colour scale to the full correlation range.
    vmin=-1,
    vmax=1,
    linewidth=2,
    annot=True,
    square=True,
)
plot.set_xlabel('Excerpt metadata', size=14)
plot.set_title('Excerpt metadata correlation with target', size=16)
fig.tight_layout()

Several of these features have a negative correlation with the target. `word_count` and `sent_count` do not have as strong a correlation, which is reasonable given that overall excerpt length was probably somewhat engineered to be standard across all records.

In [None]:
# Scatter the word count of the raw (uncleaned) excerpt against the target.
fig = plt.figure(figsize=(6,6))
plot = sns.scatterplot(
    data=df,
    # Raw string so that "\w" is not treated as an (invalid) string escape.
    x=df['excerpt'].transform(lambda excerpt: len(regexp_tokenize(excerpt, pattern=r'\w+'))),
    y=df['target'],
    linewidth=0,
)
plot.set_xlabel('raw excerpt word count')
plot.set_title('Raw excerpt length v. target', size=16)
fig.show()

This figure supports the previous note, as the raw excerpt word count has no visual correlation with the target.

In [None]:
# Pairwise correlations between the metadata features (computed once, reused below).
meta_corr = meta_df.corr()
fig, ax = plt.subplots(1, 1, figsize=(10,10))
# Mask the upper triangle (including the diagonal) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.triu(np.ones_like(meta_corr, dtype=bool))
plot = sns.heatmap(
    data=meta_corr,
    mask=mask,
    vmin=-1,
    vmax=1,
    annot=True,
    linewidth=2,
    square=True,
)
plot.set_title('Correlation heatmap of excerpt metadata', fontsize=16)

Notes:
* `char_count` is highly positively correlated with `word_count` and `polysyllables`.
* `mean_syllables`, `mean_chars_per_word`, and `polysyllables` are all highly positively correlated with each other.

## Parts of speech

Parts of speech tagging is best done without any lemmatization or removal of stop words, so excerpts from `CLEANED_EXCERPTS` shouldn't be used.

### Tagging function

In [None]:
def excerpt_pos_tag(excerpt):
    """
    Tags every sentence of a raw excerpt with parts of speech.

    Each sentence is lower-cased and tokenized by clean() with stopword
    removal and lemmatization switched off, then tagged; only the tags are
    kept, not the tokens.
    
    Keyword arguments:
        excerpt: the raw excerpt
        
    Returns:
        pos_tags (list): list of each sentence in the excerpt as a list of parts of speech tags
    """
    tagged_sentences = []
    for sentence in sent_tokenize(excerpt):
        tokens = clean(sentence, remove_stopwords=False, lemmatize=False)
        tagged_sentences.append([pair[1] for pair in pos_tag(tokens)])
    return tagged_sentences

### Example with the first excerpt

In [None]:
# Tag the first excerpt and print the tag list of each sentence on its own line.
pos_tags = excerpt_pos_tag(df['excerpt'][0])
for sentence in pos_tags:
    print(sentence)

### Tag all excerpts

In [None]:
# Tag the raw excerpts (not CLEANED_EXCERPTS), one list of tag lists per excerpt.
POS_TAGGED_EXCERPTS = df['excerpt'].transform(excerpt_pos_tag)
POS_TAGGED_EXCERPTS

### Tag counts

In [None]:
def tag_count(pos_tags):
    """
    Tallies how often each tag occurs across all sentences of an excerpt.
    
    Keyword Arguments:
        pos_tags: a list of each sentence as a list of tags.
        
    Returns:
        tag_count_dict: a plain dict mapping each tag to its number of
            occurrences, keyed in order of first appearance
    """
    counts = Counter()
    for sentence in pos_tags:
        counts.update(sentence)
    return dict(counts)

An example tag count with the first excerpt

In [None]:
tag_count(POS_TAGGED_EXCERPTS[0])

Now count tags for all excerpts, with a column for each tag.

In [None]:
# Count tags per excerpt, then expand the dicts into one integer column per tag;
# tags that do not occur in an excerpt are filled with 0.
POS_TAG_COUNTS = POS_TAGGED_EXCERPTS.transform(tag_count)
pos_tag_counts_df = POS_TAG_COUNTS.apply(pd.Series).fillna(0).astype(int)
pos_tag_counts_df.head()

In [None]:
# For every tag: the number of excerpts in which it occurs at least once
# (all records minus the records whose count for that tag is 0).
non_zero_pos_tag_counts_df = pd.DataFrame((RECORDS - (pos_tag_counts_df == 0).astype(int).sum(axis=0)))
# Most widespread tags first, then turn the tag index into a regular column.
non_zero_pos_tag_counts_df = non_zero_pos_tag_counts_df.sort_values(0, axis=0, ascending=False)
non_zero_pos_tag_counts_df = non_zero_pos_tag_counts_df.reset_index()
non_zero_pos_tag_counts_df.columns = ['tag', 'non_zero_count']
fig = plt.figure(figsize=(16,6))
plot = sns.barplot(
    data=non_zero_pos_tag_counts_df,
    x= 'tag',
    y='non_zero_count',
    color=sns.color_palette('deep')[0],
)
# Write each bar's value slightly above it (20 units on the y axis).
for index, row in non_zero_pos_tag_counts_df.iterrows():
    plot.text(index,row.non_zero_count+20, row.non_zero_count, color='black', ha='center')
plot.set_xticklabels(plot.get_xticklabels(), size=14, rotation=45)
plot.set_title('Number of non zero counts for parts of speech tags', fontsize=16)
fig.show()

Some tags are absent from the overwhelming majority of excerpts, and their counts likely shouldn't be included in the modelling. Also note that the SYM tag is missing, a result of our tokenization process removing punctuation.

In [None]:
# Keep the 25 tags that occur in the most excerpts (the frame is sorted by
# non_zero_count in descending order).
USEFUL_TAGS = non_zero_pos_tag_counts_df['tag'].head(25)

In [None]:
# Pairwise correlations between the POS tag counts (computed once, reused below).
pos_tag_corr = pos_tag_counts_df.corr()
fig = plt.figure(figsize=(8,6))
# Mask the upper triangle (including the diagonal) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.triu(np.ones_like(pos_tag_corr, dtype=bool))
plot = sns.heatmap(
    data=pos_tag_corr,
    mask=mask,
    vmin=-1,
    vmax=1,
    linewidth=0,
    square=True,
)
plot.set_title('Correlation heatmap of POS tag counts', size=16)
fig.tight_layout()
fig.show()

In [None]:
# Cross-correlation block only: metadata features as rows, POS tag counts as columns.
corr = pd.concat([meta_df, pos_tag_counts_df], axis=1).corr().drop(pos_tag_counts_df.columns, axis=0).drop(meta_df.columns, axis=1)
fig = plt.figure(figsize=(12,3))
plot = sns.heatmap(
    data=corr,
    vmin=-1,
    vmax=1,
    linewidth=0,
    square=True,
)
plot.set_xlabel('POS tag counts', size=14)
plot.set_ylabel('Excerpt metadata', size=14)
plot.set_title('Heatmap of POS tag count correlation with excerpt metadata', size=16)
fig.tight_layout()

In [None]:
# Correlate POS tag counts with the target, then keep only the 'target' row
# and the tag columns, giving a single-row matrix.
corr = pd.concat([pos_tag_counts_df, df['target']], axis=1).corr().drop('target', axis=1).drop(pos_tag_counts_df.columns, axis=0)
fig = plt.figure(figsize=(12,2))
plot = sns.heatmap(
    data=corr,
    vmin=-1,
    vmax=1,
    linewidth=2,
    annot=True,
    # Annotations are drawn rotated by 90 degrees.
    annot_kws={'rotation': 90}
)
plot.set_xlabel('POS tag count', size=14)
plot.set_title('POS tag count correlation with target', size=16)
fig.tight_layout()

Some POS tag counts are slightly correlated with target, but none are significant.

## Readability Formulas

In [None]:
# Classic readability scores derived from the excerpt metadata. Note that
# meta_df was computed from CLEANED_EXCERPTS, so the word-based inputs are
# the cleaned token counts.
readability_df = pd.DataFrame()

# Coleman-Liau Index
# (characters per 100 words and sentences per 100 words)
readability_df['cli'] = (0.0588 * meta_df['mean_chars_per_word']*100) - (0.296 * 100/meta_df['mean_words_per_sent']) - 15.8

# Flesch-kincaid Grade Level
readability_df['fkgl'] = (0.39 * meta_df['mean_words_per_sent']) + (11.8 * meta_df['mean_syllables']) - 15.59

# Flesch Reading Ease
# NOTE(review): the published formula is
#   206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words).
# The expression below instead divides mean_syllables by mean_words_per_sent
# and mean_words_per_sent by sent_count - confirm whether this is intended.
# The submission cell repeats the same expression, so the two must be
# changed together to keep train and submission features consistent.
readability_df['fre'] = 206.835 - (84.6 * (meta_df['mean_syllables'] / meta_df['mean_words_per_sent'])) - (1.015 * (meta_df['mean_words_per_sent'] / meta_df['sent_count']))

# Simple Measure of Gobbledygook
readability_df['smog'] = (1.043 * np.sqrt(meta_df['polysyllables'] * (30 / meta_df['sent_count']))) + 3.1291

# Gunning Fog Index
# (polysyllables stands in for the "complex word" count)
readability_df['gfi'] = 0.4 * ((meta_df['word_count'] / meta_df['sent_count']) + ((meta_df['polysyllables'] / meta_df['word_count']) * 100))

### Visualizations

In [None]:
readability_df.head()

In [None]:
# One boxen plot per readability score, laid out side by side in a single row.
fig, ax = plt.subplots(1, len(readability_df.columns), figsize=(16,6))
for i, column in enumerate(readability_df.columns):
    plot = sns.boxenplot(
        data=readability_df,
        y=column,
        ax=ax[i],
        linewidth=1,
        width=.5,
    )
    # The label starts with a newline character.
    plot.set_ylabel(f'\n{column}', fontsize=14)
fig.suptitle('Boxenplots for readability formulas', fontsize=12)
fig.tight_layout()
fig.show()

In [None]:
# Scatter every readability score against the target on a 2x3 grid with a shared y axis.
fig, ax = plt.subplots(2, 3, figsize=(12,8), sharey=True)
for i, column in enumerate(readability_df.columns):
    plot = sns.scatterplot(
        data=readability_df,
        x=column,
        y=df['target'],
        # Fill the grid row by row.
        ax=ax[i//3, i%3],
        linewidth=0,
        alpha=0.2,
    )
    plot.set_xlabel(f'{column}', fontsize=14)
# Hide the last panel.
# NOTE(review): this hard-codes the assumption that readability_df has exactly 5 columns.
ax[1,2].axis('off')
fig.suptitle('Scatterplots for readability formulas against target', fontsize=16)
fig.tight_layout()
fig.show()

In [None]:
# Pairwise correlations between the readability scores (computed once, reused below).
readability_corr = readability_df.corr()
fig = plt.figure(figsize=(8,5))
# Mask the upper triangle (including the diagonal) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.triu(np.ones_like(readability_corr, dtype=bool))
plot = sns.heatmap(
    data=readability_corr,
    mask=mask,
    vmin=-1,
    vmax=1,
    annot=True,
    linewidth=2,
    square=True,
)
plot.set_title('Correlation heatmap of readability formulas', size=16)
fig.tight_layout()
fig.show()

The various readability formulas are highly correlated with each other, except for `fre`.

In [None]:
# Cross-correlation block only: metadata features as rows, readability scores as columns.
corr = pd.concat([meta_df, readability_df], axis=1).corr().drop(readability_df.columns, axis=0).drop(meta_df.columns, axis=1)
fig = plt.figure(figsize=(10,10))
plot = sns.heatmap(
    data=corr,
    vmin=-1,
    vmax=1,
    linewidth=2,
    annot=True,
    square=True,
)
plot.set_xlabel('Readability formulas', size=14)
plot.set_ylabel('Excerpt metadata', size=14)
plot.set_title('Heatmap of readability formula correlation with excerpt metadata', size=16)
fig.tight_layout()

As the readability formulas use the excerpt metadata, it would be expected that they are highly correlated with each other. This is especially the case for `mean_chars_per_word`, `mean_syllables`, and `polysyllables` with all readability formulas except `fre`. `fre` is highly negatively correlated with `sent_count`.

# Final Data Preparation

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

## Creating training and test set

In [None]:
# Assemble the full (unscaled) feature matrix from the three engineered frames.
X_unscaled = pd.concat([meta_df, pos_tag_counts_df, readability_df], axis=1)
# Remove features that are too dependent on excerpt length
X_unscaled = X_unscaled.drop(['word_count', 'sent_count'], axis=1)

# Remove tags that don't appear in many excerpts
useful_tags = set(USEFUL_TAGS)
dropped_tags = [tag for tag in non_zero_pos_tag_counts_df['tag'] if tag not in useful_tags]
X_unscaled = X_unscaled.drop(dropped_tags, axis=1)

y = pd.DataFrame(df['target'])

# Split BEFORE scaling so that the scaler's statistics come from the training
# rows only; fitting it on all rows would leak information about the held-out
# rows into the features the models are trained on.
X_train, X_test, y_train, y_test = train_test_split(X_unscaled, y, random_state=75)

# Scale features: fit on the training split, then apply the same transform everywhere.
scaler = RobustScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
# Scaled version of the full feature matrix, kept under its original name.
X = pd.DataFrame(scaler.transform(X_unscaled), columns=X_unscaled.columns, index=X_unscaled.index)

for name, split in {
    'X_train': X_train, 
    'y_train': y_train, 
    'X_test': X_test, 
    'y_test': y_test,
}.items():
    descripter = f'The shape of {name} is:'
    print(f'{descripter:<24} {split.shape}')
X_train.head()

# Modelling

## Imports

In [None]:
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import GridSearchCV

## Modelling Helper Function

In [None]:
def fit_score_model(regressor):
    """
    Fits a model on the training split and scores it on the test split.

    Reads the notebook-level X_train, y_train, X_test and y_test.
    
    Keyword Parameters:
        regressor: the (unfitted) estimator to use
        
    Returns:
        result (dict): a dictionary with the fitted model under 'model' and
            its test-set RMSE under 'accuracy' (lower is better, despite the
            key name)
    """
    # y_train is a one-column DataFrame; estimators expect a 1-D target.
    model = regressor.fit(X_train, np.ravel(y_train))

    y_pred = model.predict(X_test)
    # RMSE as sqrt(MSE). The `squared=False` shortcut of mean_squared_error is
    # deprecated since scikit-learn 1.4 and removed in 1.6, whereas this form
    # works on every version.
    accuracy = np.sqrt(mean_squared_error(y_test, y_pred))

    return {'model': model, 'accuracy': accuracy}

In [None]:
def model_results(name, regressor, param_grid=None):
    """
    Fits and reports a model, optionally followed by a grid-searched version.

    Keyword Parameters:
        name: label used in the printed RMSE lines
        regressor: the estimator to fit
        param_grid: grid passed to GridSearchCV; when None no tuning is done

    Returns:
        result (dict): the fit_score_model() result of the grid search when a
            grid is given, otherwise that of the untuned regressor
    """
    baseline = fit_score_model(regressor)
    print(f'{name} RMSE no tuning: {round(baseline["accuracy"], 3)}')

    # Without a grid the untuned model is the final answer.
    if param_grid is None:
        return baseline

    # Parameter tuning
    search = GridSearchCV(regressor, param_grid)
    tuned_model = fit_score_model(search)
    print(f'{name} RMSE: {round(tuned_model["accuracy"], 3)}')
    return tuned_model

## Linear Regression

In [None]:
# No parameter grid is passed, so only the untuned model is fitted and reported.
linear_model = model_results('Linear Regression', LinearRegression())

## Ridge

In [None]:
# Candidate values for Ridge's `alpha`, spaced by powers of ten.
parameters = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
}
ridge_model = model_results('Ridge', Ridge(), parameters)

## Lasso

In [None]:
# Candidate values for Lasso's `alpha`, spaced by powers of ten.
parameters = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
}
lasso_model = model_results('Lasso', Lasso(), parameters)

## Elastic-Net

In [None]:
# Search every combination of `alpha` and `l1_ratio` listed below.
parameters = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
    'l1_ratio': [0.125, 0.25, 0.5, 0.75, 0.875],
}
elastic_model = model_results('Elastic-Net', ElasticNet(), parameters)

## Stochastic Gradient Descent

In [None]:
# Two separate grids so that `l1_ratio` is only varied together with the
# 'elasticnet' penalty; the 'l2' and 'l1' penalties are searched over `alpha` alone.
param_grid = [
    {'penalty': ['l2', 'l1'], 'alpha': [0.01, 0.1, 1.0, 10.0],},
    {'penalty': ['elasticnet'], 'alpha': [0.01, 0.1, 1.0, 10.0], 'l1_ratio': [0.125, 0.25, 0.5, 0.75, 0.875],}
]
# A fixed random_state is passed to the SGD regressor.
sgd_model = model_results('Stochastic Gradient Descent', SGDRegressor(random_state=54), param_grid)

## Support Vector Machine

In [None]:
# Linear-kernel SVR; no parameter grid is passed, so only the untuned model is fitted.
linear_svm_model = model_results('Linear Support Vector Machine', SVR(kernel='linear'))

In [None]:
# Polynomial-kernel SVR: search over the polynomial degree and the gamma setting.
parameters = {
    'degree': [2,3,4,5],
    'gamma': ['scale', 'auto'],
}
poly_svm_model = model_results('Poly Support Vector Machine', SVR(kernel='poly'), parameters)

## K-Nearest Neighbors

In [None]:
# Search over neighbourhood sizes (powers of two up to 64) and both weighting schemes.
parameters = {
    'n_neighbors': [1,2,4,8,16,32,64],
    'weights': ['uniform', 'distance'],
}
knearest_model = model_results('K-Nearest Neighbors', KNeighborsRegressor(), parameters)

The currently best performing model is Ridge, with an RMSE of 0.756.

In [None]:
# Result dict (keys 'model' and 'accuracy') of the model used for the submission.
SELECTED_MODEL = ridge_model

# Creating submission file

In [None]:
# Load the competition test set and keep the ids aside for the submission file.
submission_raw = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
ids = submission_raw['id']

SUBMISSION_CLEANED_EXCERPTS = submission_raw['excerpt'].transform(lambda excerpt: [clean(sentence) for sentence in sent_tokenize(excerpt)])

# Excerpt metadata, computed exactly as for the training set.
submission_features = pd.DataFrame(index=submission_raw.index)
submission_features['char_count'] = SUBMISSION_CLEANED_EXCERPTS.transform(lambda excerpt: sum([len(word) for sentence in excerpt for word in sentence]))
submission_features['word_count'] = SUBMISSION_CLEANED_EXCERPTS.transform(lambda excerpt: len([word for sentence in excerpt for word in sentence]))
submission_features['sent_count'] = SUBMISSION_CLEANED_EXCERPTS.transform(len)
submission_features['mean_words_per_sent'] = SUBMISSION_CLEANED_EXCERPTS.transform(mean_words_per_sent)
submission_features['mean_chars_per_word'] = SUBMISSION_CLEANED_EXCERPTS.transform(mean_chars_per_word)
submission_features['mean_syllables'] = SUBMISSION_CLEANED_EXCERPTS.transform(mean_syllable_count_per_word)
submission_features['polysyllables'] = SUBMISSION_CLEANED_EXCERPTS.transform(polysyllables)

# POS tag counts on the raw excerpts.
SUBMISSION_POS_TAGGED_EXCERPTS = submission_raw['excerpt'].transform(excerpt_pos_tag)
SUBMISSION_POS_TAG_COUNTS = SUBMISSION_POS_TAGGED_EXCERPTS.transform(tag_count)
submission_tag_counts_df = SUBMISSION_POS_TAG_COUNTS.apply(pd.Series).fillna(0).astype(int)
# Keep exactly the tags selected on the training data. A selected tag that
# never occurs in the test excerpts has no column yet, so it is added with a
# count of 0; tags outside the selection are dropped.
submission_tag_counts_df = submission_tag_counts_df.reindex(columns=list(USEFUL_TAGS), fill_value=0)
submission_features = pd.concat([submission_features, submission_tag_counts_df], axis=1)

# Readability formulas, identical to the expressions used for the training features.
submission_features['cli'] = (0.0588 * submission_features['mean_chars_per_word']*100) - (0.296 * 100/submission_features['mean_words_per_sent']) - 15.8
submission_features['fkgl'] = (0.39 * submission_features['mean_words_per_sent']) + (11.8 * submission_features['mean_syllables']) - 15.59
# NOTE(review): kept identical to the training-time 'fre' expression, which does
# not match the published Flesch Reading Ease formula; change both together.
submission_features['fre'] = 206.835 - (84.6 * (submission_features['mean_syllables'] / submission_features['mean_words_per_sent'])) - (1.015 * (submission_features['mean_words_per_sent'] / submission_features['sent_count']))
submission_features['smog'] = (1.043 * np.sqrt(submission_features['polysyllables'] * (30 / submission_features['sent_count']))) + 3.1291
submission_features['gfi'] = 0.4 * ((submission_features['word_count'] / submission_features['sent_count']) + ((submission_features['polysyllables'] / submission_features['word_count']) * 100))

# Select the training features in the training column order. Tag columns appear
# in first-seen order, which differs between train and test data, and the
# scaler and model both rely on the column layout they were fitted with.
# This also drops 'word_count' and 'sent_count', which are not model features.
submission_features = submission_features[list(X_train.columns)]

submission_df = pd.DataFrame(
    scaler.transform(submission_features),
    columns=submission_features.columns,
    index=submission_features.index,
)

submission_df.shape

In [None]:
# Predict with the selected model and index the predictions by excerpt id.
# Building the frame column-wise keeps 'target' as a float column; stacking
# ids and predictions as rows and transposing made both columns object dtype.
predictions = SELECTED_MODEL['model'].predict(submission_df)
final = pd.DataFrame({'target': predictions}, index=pd.Index(ids, name='id'))
final

In [None]:
# Write the predictions to submission.csv; the index is written as the first column.
final.to_csv('submission.csv')