In [None]:
# Install the third-party NLP packages imported by the cells below.
# NOTE(review): `pip download` fetches the textstat distribution files without
# installing them — presumably kept for offline reuse; confirm it is still needed.
!pip download textstat
!pip install textstat

# spaCy plus the small English pipeline that is later loaded with
# spacy.load('en_core_web_sm').
!pip install spacy
!python3 -m spacy download en_core_web_sm
# TextBlob provides the per-sentence polarity / subjectivity scores used below.
!pip install textblob

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.preprocessing import KBinsDiscretizer
import textstat
from textblob import TextBlob
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance


class regressor_stratified:
    """Stratified shuffle splitter for a continuous (regression) target.

    The target is discretised into ``group_count`` ordinal bins with
    ``KBinsDiscretizer``; those bin ids are then used as the class labels
    that ``StratifiedShuffleSplit`` stratifies on.

    Parameters
    ----------
    n_splits : int, default=1
        Number of shuffle splits to generate.
    group_count : int, default=10
        Number of bins the target is cut into.
    random_state : int, default=42
        Seed forwarded to ``StratifiedShuffleSplit``.
    strategy : str, default='quantile'
        Binning strategy forwarded to ``KBinsDiscretizer``.
    val_size : float, default=0.2
        Forwarded to ``StratifiedShuffleSplit`` as ``test_size``.
    """

    def __init__(self,n_splits=1,group_count=10,random_state=42,strategy='quantile',val_size=0.2):
        self.group_count=group_count
        self.strategy=strategy
        self.cvkwargs=dict(n_splits=n_splits,test_size=val_size,random_state=random_state)
        self.cv=StratifiedShuffleSplit(**self.cvkwargs)
        self.discretizer=KBinsDiscretizer(n_bins=self.group_count,encode='ordinal',strategy=self.strategy)

    def split(self,X,y,groups=None):
        """Return the train/validation index generator, stratified on binned ``y``.

        ``y`` may be any 1-D array-like (list, NumPy array, pandas Series).
        """
        # Go through NumPy before adding the column axis: 2-D indexing such as
        # ``series[:, None]`` is not supported by recent pandas versions.
        target_column=np.asarray(y).reshape(-1,1)
        kgroups=self.discretizer.fit_transform(target_column)[:,0]
        return self.cv.split(X,kgroups,groups)

    def get_n_splits(self,X=None,y=None,groups=None):
        """Return the number of splits reported by the wrapped splitter."""
        return self.cv.get_n_splits(X,y,groups)

In [None]:
!ls ..
df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
df.head()

In [None]:
# Raw character count of each excerpt.
df['char_cnt'] = df['excerpt'].str.len()

# One feature column per textstat scorer, applied excerpt by excerpt.
# Insertion order is kept so the columns are created in the same sequence.
textstat_scorers = {
    'syllable_cnt': textstat.syllable_count,
    'word_cnt': textstat.lexicon_count,
    'sent_cnt': textstat.sentence_count,
    'flesch_reading_ease': textstat.flesch_reading_ease,
    'flesch_kincaid_grade': textstat.flesch_kincaid_grade,
    'gunning_fog': textstat.gunning_fog,
    'ari': textstat.automated_readability_index,
    'cli': textstat.coleman_liau_index,
    'lwf': textstat.linsear_write_formula,
    'dcrs': textstat.dale_chall_readability_score,
}
for column_name, scorer in textstat_scorers.items():
    df[column_name] = df['excerpt'].apply(scorer)



In [None]:
def determine_polarity_and_subjectivity_stats(text):
    """Summarise per-sentence TextBlob polarity and subjectivity of ``text``.

    Returns a 10-tuple: (mean, median, range, inter-quartile range, std) of the
    sentence polarities, followed by the same five statistics of the sentence
    subjectivities.
    """
    def summarise(scores):
        # Same five statistics, in the same order, for either score list.
        lower_quartile = np.percentile(scores, 25)
        upper_quartile = np.percentile(scores, 75)
        return (
            np.mean(scores),
            np.median(scores),
            np.ptp(scores),
            upper_quartile - lower_quartile,
            np.std(scores),
        )

    sentences = TextBlob(text).sentences
    polarities = [sentence.polarity for sentence in sentences]
    subjectivities = [sentence.subjectivity for sentence in sentences]

    return summarise(polarities) + summarise(subjectivities)

    
# Column names, in the order the statistics are returned by
# determine_polarity_and_subjectivity_stats.
sentiment_columns = [
    'pol_mean', 'pol_median', 'pol_range', 'pol_cutted_range', 'pol_std',
    'subj_mean', 'subj_median', 'subj_range', 'subj_cutted_range', 'subj_std',
]
per_excerpt_stats = df.excerpt.apply(determine_polarity_and_subjectivity_stats)
# Transpose the per-row tuples into one tuple per statistic and store each as a column.
for column_name, column_values in zip(sentiment_columns, zip(*per_excerpt_stats)):
    df[column_name] = column_values

In [None]:
def avg_word(sentence):
    """Return the mean length, in characters, of the whitespace-separated words.

    Returns 0.0 when ``sentence`` contains no words (empty or whitespace-only
    text) instead of raising ZeroDivisionError.
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Mean word length of every excerpt, computed by avg_word.
df['avg_word'] = df['excerpt'].apply(avg_word)


In [None]:
# English stop-word list from NLTK; the list is kept under its original name.
stop = stopwords.words('english')
# Set membership is O(1) per token; scanning the list for every token was not.
stop_set = frozenset(stop)

# Count whitespace-separated tokens that exactly match a stop word.
# NOTE(review): matching is exact, so capitalised tokens or tokens with attached
# punctuation only count if the list contains them in that form — confirm intended.
df['stopwords_cnt'] = df.excerpt.apply(
    lambda text: sum(token in stop_set for token in text.split())
)
# Average sentence length in words, and the share of words that are stop words.
df['avg_sent'] = df['word_cnt']/df['sent_cnt']
df['stopwords_vs_words'] = df['stopwords_cnt']/df['word_cnt']


In [None]:
# Small English spaCy pipeline used for part-of-speech tagging.
sp = spacy.load('en_core_web_sm')

# Coarse-grained POS tags counted per excerpt, in output-column order.
# The named spaCy symbols replace the previously hard-coded integer ids
# (92, 95, 96, 84, 93, 97) so the list documents itself.
from spacy.symbols import ADJ, NOUN, NUM, PRON, PROPN, PUNCT
pos_ids_to_look_for = [NOUN, PRON, PROPN, ADJ, NUM, PUNCT]
def look_for_pos_freq(text):
    """Return the token count of each POS id in ``pos_ids_to_look_for`` for ``text``.

    The text is run through the module-level spaCy pipeline ``sp``; ids that do
    not occur in the document are reported as 0.
    """
    doc = sp(text)
    counts_by_pos = doc.count_by(spacy.attrs.POS)
    return [counts_by_pos.get(pos_id, 0) for pos_id in pos_ids_to_look_for]


# Column names in the same order as the ids in pos_ids_to_look_for.
pos_columns = ['noun', 'pron', 'propn', 'adj', 'num', 'punct']
per_excerpt_counts = df.excerpt.apply(look_for_pos_freq)
# Transpose the per-row count lists into one tuple per POS tag.
for column_name, column_values in zip(pos_columns, zip(*per_excerpt_counts)):
    df[column_name] = column_values

df.head(1)

In [None]:
# Hold-out split stratified on the binned target.
# The class defined above is `regressor_stratified`; the previous name
# `regressor_stratified_cv` does not exist and raised NameError on a fresh kernel.
cv = regressor_stratified(group_count=10,random_state=42,strategy='uniform')

# Hand the splitter a plain NumPy target so it does not depend on how pandas
# handles multi-dimensional indexing of a Series.
train, val = next(cv.split(df[['excerpt']], df['target'].to_numpy()))

# split() yields positional indices, so select by position (iloc) rather than
# comparing them with index labels; sorting keeps the original row order.
df_train = df.iloc[np.sort(train)]
df_train.target.plot(kind='kde', label='train', legend=True)

df_val = df.iloc[np.sort(val)]
df_val.target.plot(kind='kde', label='val', legend=True)

print(len(df_train), len(df_val))

X_train, X_val = df_train.drop(['target'], axis=1), df_val.drop(['target'], axis=1)
y_train, y_val = df_train['target'], df_val['target']


In [None]:
# Fit the TF-IDF vocabulary and IDF weights on the training excerpts only.
vectorizer = TfidfVectorizer()
# `train_X` was never defined; the training frame is `X_train`.
# NOTE: the result of fit() is kept under the original name `vectors`.
vectors = vectorizer.fit(X_train.excerpt)


In [None]:
# For each excerpt keep only the stored (non-zero) TF-IDF weights of its
# one-row sparse matrix; train first, then validation.
for split_frame in (X_train, X_val):
    split_frame['tfidf'] = split_frame.excerpt.apply(
        lambda excerpt_text: vectorizer.transform([excerpt_text]).data
    )


In [None]:
# Summary statistics of each excerpt's TF-IDF weights; the dict order fixes
# the order in which the feature columns are created.
tfidf_summaries = {
    'tfidf_mean': np.mean,
    'tfidf_median': np.median,
    'tfidf_std': np.std,
    'tfidf_range': np.ptp,
    'tfidf_cutted_range': lambda weights: np.percentile(weights, 75) - np.percentile(weights, 25),
}

for column_name, summarise in tfidf_summaries.items():
    for split_frame in (X_train, X_val):
        split_frame[column_name] = split_frame.tfidf.apply(summarise)

In [None]:
# Identifier, licensing, raw-text and helper columns that must not reach the model.
non_feature_columns = ['id', 'url_legal', 'license', 'excerpt', 'standard_error', 'tfidf']
X_train = X_train.drop(columns=non_feature_columns)
X_val = X_val.drop(columns=non_feature_columns)

# Show the remaining features and check both splits have the same column count.
same_width = len(X_train.columns) == len(X_val.columns)
print(X_train.columns, same_width)


In [None]:
# Sanity-check the arrays handed to scikit-learn: shapes, container types, first row.
train_matrix, train_target = X_train.values, y_train.values
print(train_matrix.shape, train_target.shape)
print(type(train_matrix), type(train_target))
print(train_matrix[0])

In [None]:
# Seed the estimator as well as the search so reruns give the same result.
reg = GradientBoostingRegressor(random_state=42)

# scikit-learn 1.0 renamed the losses 'ls' -> 'squared_error' and
# 'lad' -> 'absolute_error', and 1.2 removed the old names. The estimator's
# own default reveals which spelling the installed version expects.
if reg.loss == 'squared_error':
    loss_options = ['squared_error', 'absolute_error', 'huber']
else:
    loss_options = ['ls', 'lad', 'huber']

# Search space sampled by RandomizedSearchCV.
param_grid = {
    "n_estimators": np.arange(100, 1000, step=100),    # 100, 200, ..., 900
    "max_depth": np.arange(3, 8, step=1),              # 3 .. 7
    "min_samples_split": np.arange(2, 10, step=2),     # 2, 4, 6, 8
    'learning_rate': np.arange(1, 20, step=5) / 100,   # 0.01, 0.06, 0.11, 0.16
    'loss': loss_options
}

random_search = RandomizedSearchCV(reg, param_grid, n_iter=20, n_jobs=-1, random_state=42)
random_search.fit(X_train.values, y_train.values)

print('Best params:')
print(random_search.best_params_)

In [None]:
# Validation RMSE of the best estimator found by the randomized search.
reg = random_search.best_estimator_
rmse = np.sqrt(mean_squared_error(y_val, reg.predict(X_val.values)))
print("The root mean squared error (RMSE) on val set: {:.4f}".format(rmse))


# Hand-tuned baseline for comparison. The loss is left at its default
# (least squares): the explicit name 'ls' is rejected by scikit-learn >= 1.2.
# random_state makes the fit repeatable.
reg2 = GradientBoostingRegressor(n_estimators=500, max_depth=4, min_samples_split=3,
                                 learning_rate=0.01, random_state=42)
reg2.fit(X_train.values, y_train.values)
rmse = np.sqrt(mean_squared_error(y_val, reg2.predict(X_val.values)))
print("The root mean squared error (RMSE) on val set: {:.4f}".format(rmse))


In [None]:
feature_names = np.array(X_train.columns)

fig, (ax_mdi, ax_perm) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: impurity-based (MDI) importances of the search's best estimator.
# NOTE(review): this panel uses `reg` while the permutation panel uses `reg2`
# — confirm that comparing two different models side by side is intended.
feature_importance = reg.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
ax_mdi.barh(pos, feature_importance[sorted_idx], align='center')
ax_mdi.set_yticks(pos)
ax_mdi.set_yticklabels(feature_names[sorted_idx])
ax_mdi.set_title('Feature Importance (MDI)')

# Right panel: permutation importances of the hand-tuned model on the
# validation split. The model was fitted on a bare array, so pass `.values`
# here too; a DataFrame triggers a feature-name warning on every predict call.
result = permutation_importance(reg2, X_val.values, y_val, n_repeats=10,
                                random_state=42, n_jobs=2)
sorted_idx = result.importances_mean.argsort()
ax_perm.boxplot(result.importances[sorted_idx].T, vert=False)
# Label the boxes via the axis; the boxplot `labels=` keyword is deprecated
# in newer matplotlib releases.
ax_perm.set_yticklabels(feature_names[sorted_idx])
ax_perm.set_title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()