# CommonLit Readability
### With simple features and preprocessing

In [None]:
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from tqdm.notebook import tqdm

from statsmodels import api as smf
from scipy import stats as sts
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
plt.style.use('seaborn')

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [None]:
# Seed passed to the KFold splitter so the shuffled cross-validation folds are reproducible.
RANDOM_STATE = 42  # set the random state

In [None]:
# Load the training CSV from the competition input directory and preview the first rows.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train.head()

In [None]:
# Load the test CSV from the competition input directory and preview the first rows.
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test.head()

In [None]:
# Load the sample submission file and preview the first rows.
sample_submission = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
sample_submission.head()

## EDA

In [None]:
# Summary statistics of the training frame.
train.describe()

### Take a look at some visualizations

In [None]:
# Histogram (40 bins, with KDE overlay) of the `target` column.
plt.figure(figsize=(12, 6))
sns.histplot(train.target, bins=40, kde=True)
plt.show()

In [None]:
# Histogram (40 bins, with KDE overlay) of the `standard_error` column.
plt.figure(figsize=(12, 6))
sns.histplot(train.standard_error, bins=40, kde=True)
plt.show()

### The standard error has a long tail, so let's treat it as an outlier

In [None]:
# Fraction of missing values in each column of the training frame.
train.isna().sum(axis=0) / len(train)  # 70% of NaNs

In [None]:
# Heatmap of the missing-value mask: one row per sample, one column per field.
plt.figure(figsize=(12, 6))
sns.heatmap(train.isna())
plt.show()

In [None]:
# Histogram (40 bins, with KDE overlay) of excerpt length in characters.
plt.figure(figsize=(12, 6))
sns.histplot(train.excerpt.apply(len), bins=40, kde=True)
plt.show()

In [None]:
# Check that `url_legal` and `license` are missing at exactly the same positions.
bool((train['url_legal'].isna() == train['license'].isna()).all())

In [None]:
# New simple feature: excerpt length in characters.
train['text_length'] = train['excerpt'].map(len)
train.head()

In [None]:
def count_punctuation(text, which=('.', ',', '"', ':', ';', '!', '?', "'", '[', ']', '{', '}')):
    """Return how many times the symbols in *which* occur in *text*.

    Parameters
    ----------
    text : str
        The string to scan.
    which : iterable of str
        Substrings to count. The default is an immutable tuple of common
        punctuation marks, so the default cannot be mutated across calls.

    Returns
    -------
    int
        Sum of the non-overlapping occurrence counts of every symbol.
    """
    return sum(text.count(symbol) for symbol in which)

In [None]:
# Simple feature: number of punctuation symbols in each excerpt.
train['punct_count'] = train['excerpt'].map(count_punctuation)
train.head()

In [None]:
# Pairwise scatter plots of the target, its standard error and the two simple features.
# NOTE(review): sns.pairplot appears to build its own figure, so the plt.figure call
# above it probably only creates an extra empty figure — confirm before removing.
plt.figure(figsize=(40, 40))
sns.pairplot(train[['target', 'standard_error', 'text_length', 'punct_count']])
plt.show()

### As mentioned above, there is an outlier; let's get rid of it

In [None]:
# Keep only rows whose standard error exceeds 0.4. The explicit copy makes `train`
# an independent frame, so the columns added to it afterwards are not written
# into a slice of the original frame (pandas SettingWithCopy situation).
train = train[train['standard_error'] > 0.4].copy()

In [None]:
# New binary feature: 1 when a license is given for the excerpt, 0 otherwise.
train['license_available'] = train['license'].notna().astype(int)

In [None]:
# Same pairwise plots as before, coloured by the `license_available` flag.
# NOTE(review): sns.pairplot appears to build its own figure, so the plt.figure call
# above it probably only creates an extra empty figure — confirm before removing.
plt.figure(figsize=(40, 40))
sns.pairplot(train[['target', 'standard_error', 'text_length', 'punct_count', 'license_available']], hue='license_available')
plt.show()

In [None]:
# One box plot per numeric column, each in its own figure.
for column in ['target', 'standard_error', 'text_length', 'punct_count']:
    plt.figure(figsize=(10, 6))
    sns.boxplot(train[column])
    # NOTE(review): plt.pause(1) is used here where the other cells call plt.show();
    # presumably to force each figure to render inside the loop — confirm.
    plt.pause(1)

## Baseline OLS

In [None]:
# Design matrix and target shared by the baseline models.
x_ols = train[['text_length', 'punct_count', 'license_available']]
y_ols = train['target']


# statsmodels OLS does not add an intercept by itself, so a constant column is
# appended for this fit only; `x_ols` itself is left untouched for later cells.
model = smf.OLS(y_ols, smf.add_constant(x_ols))
res = model.fit()
res.summary()

In [None]:
# function for cross validation
def cross_validate(model, x, y, n_splits=5):
    """Run shuffled K-fold cross-validation and return the per-fold RMSE.

    Parameters
    ----------
    model : estimator with ``fit(x, y)`` and ``predict(x)``
        Refitted in place on every fold; after the call it stays fitted on
        the training part of the last fold.
    x : array-like indexable by integer arrays
        Feature matrix (rows are indexed with the fold indices).
    y : array-like indexable by integer arrays
        Target values aligned with the rows of *x*.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    list
        One root-mean-squared-error value per fold.
    """
    metrics = []
    kfolds = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    for train_index, test_index in kfolds.split(x):
        print('Fitting...')
        x_tr, y_tr = x[train_index], y[train_index]
        x_vl, y_vl = x[test_index], y[test_index]

        model.fit(x_tr, y_tr)
        preds = model.predict(x_vl)
        # Take the square root explicitly: the `squared=False` flag of
        # mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
        metrics.append(np.sqrt(mean_squared_error(y_vl, preds)))
    return metrics

In [None]:
# simple LinReg
model = LinearRegression()
metrics_baseline = cross_validate(model, x_ols.values, y_ols.values)
# cross_validate leaves `model` fitted on the last fold's training split only.
# Refit on every training row, because this model later predicts the test set.
model.fit(x_ols.values, y_ols.values)
np.mean(metrics_baseline)

In [None]:
# Build the same three simple features on the test frame.
test['text_length'] = test['excerpt'].map(len)
test['license_available'] = test['license'].notna().astype(int)
test['punct_count'] = test['excerpt'].map(count_punctuation)
test.head()

In [None]:
# Predict the test targets with the current `model` from the three simple features.
# NOTE(review): make sure `model` has been fitted on the full training set before
# this cell runs — it predicts with whatever state the preceding cells left it in.
x_test_ols = test[['text_length', 'punct_count', 'license_available']]
baseline_submission = test[['id']].copy()
baseline_submission['target'] = model.predict(x_test_ols)
# baseline_submission.to_csv('submission.csv', index=False)
baseline_submission  # 0.977 public, moving

## Text features

In [None]:
def tokenize_text(text):
    """Lower-case *text* and return its runs of word characters, in order."""
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'[\w]+', lowered)]

def tokenize_corpus(corpus):
    """Tokenize every text of *corpus* with tokenize_text, showing a progress bar.

    Returns a list holding one token list per text, in corpus order.
    """
    tokenized = []
    for text in tqdm(corpus):
        tokenized.append(tokenize_text(text))
    return tokenized

def join_tokenized_sentences(tokenized_sentences):
    """Join each token list back into one space-separated string.

    Parameters
    ----------
    tokenized_sentences : iterable of iterables of str
        One token sequence per document.

    Returns
    -------
    list of str
        One string per document, tokens separated by single spaces and with
        trailing whitespace stripped. An empty token list gives ''.
    """
    # str.join builds each string in one pass instead of repeated `+=`
    # concatenation; rstrip keeps the original trailing-whitespace behaviour.
    return [' '.join(doc).rstrip() for doc in tqdm(tokenized_sentences)]

In [None]:
corpus_train = train['excerpt'].values
docs_train = tokenize_corpus(corpus_train)

# Corpus-wide frequency of every token.
counter = Counter()
for tokens in tqdm(docs_train):
    counter.update(tokens)

# Rows of (word, count); numpy stores both columns as strings, so the counts
# are cast back to int before thresholding.
words = np.array(counter.most_common())
word_counts = words[:, 1].astype(int)
vocabulary = words[:, 0]
subset_of_rare_words = set(vocabulary[word_counts < 200])  # threshold for this may be different
subset_of_frequent_words = set(vocabulary[word_counts > 1500])

In [None]:
def check_if_rare_words(text, subset):
    """Count the words of *text* that belong to *subset*.

    Parameters
    ----------
    text : str or iterable of str
        Either a raw text or an iterable of already-split tokens. A raw
        string is split into runs of word characters first; iterating a
        string directly would compare single characters, not words.
    subset : container of str
        Words to look for. Each word is lower-cased before the membership
        test, so *subset* is expected to hold lower-case words.

    Returns
    -------
    int
        Number of word occurrences found in *subset*.
    """
    if isinstance(text, str):
        tokens = re.findall(r'[\w]+', text)
    else:
        tokens = text
    return sum(1 for word in tokens if word.lower() in subset)

def check_if_rare_in_corpus(corpus, subset):
    """Apply check_if_rare_words to every text of *corpus*, showing a progress bar.

    Returns a list with one count per text, in corpus order.
    """
    counts = []
    for text in tqdm(corpus):
        counts.append(check_if_rare_words(text, subset))
    return counts
# Text features: per-excerpt counts against the rare and the frequent word subsets.
train_excerpts = train['excerpt'].values
train['amount_of_rare_words'] = check_if_rare_in_corpus(train_excerpts, subset_of_rare_words)
train['amount_of_frequent_words'] = check_if_rare_in_corpus(train_excerpts, subset_of_frequent_words)

In [None]:
# Two more simple features: the number of '\n' characters, and the number of
# runs of consecutive ASCII capital letters (the regex matches whole runs, so
# "USA" counts once, not three times).
train['\n_amount'] = train['excerpt'].apply(lambda x: x.count('\n'))
train['capital_let_amount'] = train['excerpt'].apply(lambda x: len(re.findall(r'[A-Z]+', x)))

In [None]:
# Measure of "repeat" of the text: total tokens minus distinct tokens.
repeating = [len(doc) - len(set(doc)) for doc in docs_train]
train['repeating'] = repeating

In [None]:
# Mean token length of each excerpt. An excerpt without any token gets 0.0
# instead of raising ZeroDivisionError.
mean_word_len = [
    sum(map(len, doc)) / len(doc) if doc else 0.0
    for doc in tqdm(docs_train)
]
train['mean_word_len'] = mean_word_len  # and mean word len in each text

In [None]:
# Correlation heatmap of the numeric columns. The frame also holds text columns,
# and DataFrame.corr raises on those in pandas >= 2.0, so select numbers first.
plt.figure(figsize=(12, 10))
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True, cmap='viridis');

In [None]:
# Every hand-crafted feature built so far.
feature_columns = [
    'text_length',
    'punct_count',
    'license_available',
    'amount_of_rare_words',
    'amount_of_frequent_words',
    '\n_amount',
    'capital_let_amount',
    'repeating',
    'mean_word_len',
]
x_ols = train[feature_columns]
y_ols = train['target']

# Standardise the features; the fitted scaler is kept for transforming other data later.
scaler = StandardScaler()
x_ols = scaler.fit_transform(x_ols)

# model = Ridge(0.15)
# there was GridSearch for params
model = XGBRegressor(n_estimators=700, max_depth=5, learning_rate=0.005, n_jobs=-1)
metrics_baseline = cross_validate(model, x_ols, y_ols.values)
print(np.mean(metrics_baseline))

In [None]:
# do all the same for test set

corpus_test = test['excerpt'].values
docs_test = tokenize_corpus(corpus_test)

test['amount_of_rare_words'] = check_if_rare_in_corpus(corpus_test, subset_of_rare_words)
test['amount_of_frequent_words'] = check_if_rare_in_corpus(corpus_test, subset_of_frequent_words)
test['\n_amount'] = test['excerpt'].apply(lambda x: x.count('\n'))
test['capital_let_amount'] = test['excerpt'].apply(lambda x: len(re.findall(r'[A-Z]+', x)))

# Total tokens minus distinct tokens.
repeating = [len(doc) - len(set(doc)) for doc in docs_test]
test['repeating'] = repeating

# Mean token length of each excerpt. An excerpt without any token gets 0.0
# instead of raising ZeroDivisionError.
mean_word_len = [
    sum(map(len, doc)) / len(doc) if doc else 0.0
    for doc in tqdm(docs_test)
]
test['mean_word_len'] = mean_word_len
test.head()

In [None]:
# Refit the current model on every training row before predicting the test set.
model.fit(x_ols, y_ols.values)  # fit on all data

In [None]:
# Same feature columns, in the same order, as the scaler was fitted on.
test_feature_columns = [
    'text_length',
    'punct_count',
    'license_available',
    'amount_of_rare_words',
    'amount_of_frequent_words',
    '\n_amount',
    'capital_let_amount',
    'repeating',
    'mean_word_len',
]
x_test_freq_words = scaler.transform(test[test_feature_columns])

submission_text = test[['id']].copy()
submission_text['target'] = model.predict(x_test_freq_words)
# submission_text.to_csv('submission.csv', index=False)
submission_text

## NLP

In [None]:
# Drop English stop words and stem the remaining tokens of every training document.
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.snowball.SnowballStemmer('english')

stemmed_train_docs = []
for text in tqdm(docs_train):
    stemmed_train_docs.append([stemmer.stem(token) for token in text if token not in stopwords])
docs_train = stemmed_train_docs

train['cleared_text'] = join_tokenized_sentences(docs_train)

In [None]:
# TF-IDF over character n-grams of length 2-4, built inside word boundaries.
# `stop_words` is not passed: it only applies to the 'word' analyzer, and recent
# scikit-learn versions reject a set for that parameter.
vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(2, 4),
                             min_df=5, max_df=0.9, analyzer='char_wb')
vectorizer.fit(train['cleared_text'])

In [None]:
# TF-IDF matrix of the cleaned training texts.
x_texts = vectorizer.transform(train['cleared_text'])

In [None]:
# 5-fold cross-validation of a Ridge regressor on the TF-IDF features; shows the mean fold RMSE.
model = Ridge(1.01)
text_ols_results = cross_validate(model, x_texts, y_ols.values, 5)
np.mean(text_ols_results)

In [None]:
# Preview the test frame with the features added so far.
test.head()

In [None]:
# Refit the current model on every training row of the TF-IDF matrix.
model.fit(x_texts, y_ols.values)  # fit on all data

In [None]:
# True training targets (x axis) against the model's predictions on the same rows (y axis).
plt.scatter(y_ols.values, model.predict(x_texts))

In [None]:
# Train RMSE. The root is taken explicitly because the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mean_squared_error(y_ols.values, model.predict(x_texts)))  # train rmse

In [None]:
# Drop stop words and stem the tokens of every test document.
stemmed_test_docs = []
for text in tqdm(docs_test):
    stemmed_test_docs.append([stemmer.stem(token) for token in text if token not in stopwords])
docs_test = stemmed_test_docs

test['cleared_text'] = join_tokenized_sentences(docs_test)

In [None]:
# TF-IDF matrix of the cleaned test texts, using the vectorizer fitted on the training texts.
x_texts_test = vectorizer.transform(test['cleared_text'])

In [None]:
# Predict the test targets from the TF-IDF features with the current model.
baseline_submission_texts = test[['id']].copy()
baseline_submission_texts['target'] = model.predict(x_texts_test)
# baseline_submission_texts.to_csv('submission.csv', index=False)
baseline_submission_texts  # 0.807

In [None]:
# Final step: blend the XGBoost predictions on simple features with the Ridge
# predictions on TF-IDF features.
xgb_weight = 0.05
ridge_weight = 0.95
blended_target = xgb_weight*submission_text['target'] + ridge_weight*baseline_submission_texts['target']

submission = baseline_submission.copy()
submission['target'] = blended_target
submission.to_csv('submission.csv', index=False)
submission  # 0.725 public