In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

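The following cells install the pretrained spaCy model (`en_core_web_lg`), which supplies the word embeddings used below, from an archive in the attached input dataset.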

In [None]:
# Create the staging directory that the spaCy model archive is copied into
# before it is installed (-p: no error if the directory already exists).
!mkdir -p /tmp/pip/cache/

In [None]:
#!cp ../input/spacyword2vec/en_core_web_md-2.3.1.xyz /tmp/pip/cache/en_core_web_md-2.3.1.tar.gz

In [None]:
# Copy the en_core_web_lg archive into the staging directory under a .tar.gz
# name so that it can be installed from there.
# NOTE(review): the dataset file is stored with a .tar.xyz extension, presumably
# so the archive is not unpacked when the dataset is uploaded - confirm.
!cp ../input/spacy-encore-web-large/en_core_web_lg-2.2.5.tar.xyz /tmp/pip/cache/en_core_web_lg-2.2.5.tar.gz

In [None]:
#!pip install /tmp/pip/cache/en_core_web_md-2.3.1.tar.gz

In [None]:
!pip install /tmp/pip/cache/en_core_web_lg-2.2.5.tar.gz

In [None]:
#!pip install ../input/textstat/Pyphen-0.10.0-py3-none-any.whl

In [None]:
#!pip install ../input/textstat/textstat-0.7.0-py3-none-any.whl

Import the required libraries

In [None]:
import spacy
#import en_core_web_md
import en_core_web_lg
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import nltk
from nltk import pos_tag, word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
#from textstat import flesch_reading_ease, flesch_kincaid_grade

In [None]:
# Fetch the NLTK resources needed by the POS-tag feature below:
# 'averaged_perceptron_tagger' is the model behind pos_tag and 'punkt' is the
# tokenizer model behind word_tokenize.
# NOTE(review): nltk.download needs network access - confirm it is available
# wherever this notebook is executed.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

In [None]:
# Load the training set. The cells below read its 'excerpt' column (the text)
# and its 'target' column (the value the model is trained to predict).
df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')

In [None]:
#df['flesch_reading_ease'] = df['excerpt'].apply(lambda x: flesch_reading_ease(x))
#df['flesch_kincaid_grade'] = df['excerpt'].apply(lambda x: flesch_kincaid_grade(x))

Create another feature that holds the POS tag of every word in the given excerpt, joined into a single space-separated string.

In [None]:
def get_pos_tags(row):
  """Return the part-of-speech tags of ``row`` as one space-separated string.

  Args:
    row: The excerpt text to tag (a single string).

  Returns:
    A string with one tag per token, in token order, separated by single
    spaces. Empty string when ``row`` produces no tokens.
  """
  tokens = word_tokenize(row)
  # Tag the token list, not the raw string: pos_tag iterates over whatever it
  # receives, so handing it the string would tag every single character.
  tagged = pos_tag(tokens)
  return " ".join(tag for _, tag in tagged)
# Add the tag string of every excerpt as a new text column.
df['pos_tag'] = df['excerpt'].apply(get_pos_tags)

In [None]:
# Module-level spaCy pipeline; get_tok_lefts and get_tok_rights below both
# read this global to parse their input.
nlp = spacy.load('en_core_web_lg')
def get_tok_lefts(row):
  """Total number of left-hand children over the non-punctuation tokens.

  Parses ``row`` with the module-level ``nlp`` pipeline, skips tokens flagged
  as punctuation, and adds up how many ``lefts`` each remaining token has.

  Args:
    row: The excerpt text.

  Returns:
    The summed count as a NumPy scalar.
  """
  parsed = nlp(row)
  left_counts = []
  for tok in parsed.doc:
    if tok.is_punct:
      continue
    left_counts.append(len(list(tok.lefts)))
  # Summed through NumPy, so the result is a NumPy scalar (0.0 when no token
  # was counted).
  return np.array(left_counts).sum()


def get_tok_rights(row):
  """Total number of right-hand children over the non-punctuation tokens.

  Parses ``row`` with the module-level ``nlp`` pipeline, skips tokens flagged
  as punctuation, and adds up how many ``rights`` each remaining token has.

  Args:
    row: The excerpt text.

  Returns:
    The summed count as a NumPy scalar.
  """
  parsed = nlp(row)
  right_counts = []
  for tok in parsed.doc:
    if tok.is_punct:
      continue
    right_counts.append(len(list(tok.rights)))
  # Summed through NumPy, so the result is a NumPy scalar (0.0 when no token
  # was counted).
  return np.array(right_counts).sum()

In [None]:
# Two numeric features per excerpt: the summed left-child and right-child
# counts of its non-punctuation tokens.
# Each helper parses the text itself, so every excerpt goes through the spaCy
# pipeline twice in this cell.
df['token_lefts'] = df['excerpt'].apply(get_tok_lefts)
df['token_rights'] = df['excerpt'].apply(get_tok_rights)

Define a scikit-learn transformer class that converts each text into its spaCy word-embedding vector.

In [None]:
class WordVectorTransformer(TransformerMixin, BaseEstimator):
  """scikit-learn transformer that maps each text to its spaCy document vector.

  Args:
    model: Name of a spaCy model. NOTE(review): this value is only stored as
      an estimator parameter; the vectors are always produced by
      ``en_core_web_lg``, so the ``'en_core_web_md'`` default is not the model
      that gets loaded - confirm which one is intended.
  """

  # Loaded pipeline, shared by all instances, so the large model is read only
  # once instead of on every transform() call.
  _nlp = None

  def __init__(self, model='en_core_web_md'):
    self.model = model

  def fit(self, X, y=None):
    """Do nothing (there is nothing to learn) and return ``self``."""
    return self

  def transform(self, X):
    """Embed every document in ``X``.

    Args:
      X: Iterable of text strings.

    Returns:
      A 2-D array with one row per document, each row being
      ``nlp(doc).vector``. When ``X`` is empty the array has zero rows.
    """
    nlp = self._load_pipeline()
    vectors = [nlp(doc).vector for doc in X]
    if not vectors:
      # Stacking an empty list raises, so build the empty result explicitly
      return np.empty((0, nlp.vocab.vectors_length), dtype=np.float32)
    return np.vstack(vectors)

  @classmethod
  def _load_pipeline(cls):
    """Return the shared spaCy pipeline, loading it on first use."""
    if WordVectorTransformer._nlp is None:
      WordVectorTransformer._nlp = en_core_web_lg.load()
    return WordVectorTransformer._nlp

In [None]:
# Model inputs: the raw text, its POS-tag string and the two dependency counts.
X = df[['excerpt','pos_tag','token_lefts','token_rights']]#,'flesch_reading_ease']]
# Regression target.
y = df['target']

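Using `ColumnTransformer` from sklearn. The first transformer converts the excerpt into an embedding. The second transformer converts the POS-tagged text into tag counts using `CountVectorizer`. The remaining columns (`token_lefts`, `token_rights`) are passed through unchanged.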

In [None]:
# Per-column preprocessing. The step names are labels only: 'tfidf' is the
# embedding step and 'tfidf_pos' is a plain count vectorizer.
column_transform = ColumnTransformer(
    transformers=[
        # excerpt text -> word-vector embedding
        ('tfidf', WordVectorTransformer(), 'excerpt'),
        # POS-tag string -> tag counts
        ('tfidf_pos', CountVectorizer(), 'pos_tag'),
    ],
    # every column not named above is forwarded as-is
    remainder="passthrough",
)

The final pipeline is built using the `StackingRegressor` from sklearn. Ridge, XGBoost, LightGBM and CatBoost regressors are used as the base learners of the stacking ensemble.

In [None]:
# Fixed seed so that re-running the notebook trains the same ensemble.
SEED = 42

# Preprocessing followed by a stacking ensemble of four base regressors.
model = Pipeline([
    ('preprocess', column_transform),
    ('model', StackingRegressor([
        # the 'sag' solver shuffles the samples, so without a seed every run
        # would end up with a different fit
        ('ridge', Ridge(alpha=0.8, solver='sag', max_iter=2000, random_state=SEED)),
        ('xgb', XGBRegressor(random_state=SEED)),
        ('lgbm', LGBMRegressor(random_state=SEED)),
        ('catboost', CatBoostRegressor(random_seed=SEED)),
    ])),
])

model.fit(X, y)

In [None]:
# Load the test set and rebuild the same engineered columns that were added to
# the training frame, using the same helper functions.
df_test = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
df_test['pos_tag'] = df_test['excerpt'].apply(get_pos_tags)
#df_test['flesch_reading_ease'] = df_test['excerpt'].apply(lambda x: flesch_reading_ease(x))
df_test['token_lefts'] = df_test['excerpt'].apply(get_tok_lefts)
df_test['token_rights'] = df_test['excerpt'].apply(get_tok_rights)
# Column selection and order match the frame the model was fitted on.
X_test = df_test[['excerpt','pos_tag','token_lefts','token_rights']]#,'flesch_reading_ease']]
preds = model.predict(X_test)

In [None]:
# Submission frame: the test ids next to their predicted targets
output = df_test[['id']].assign(target=preds)

In [None]:
# Write the predictions to submission.csv in the current working directory,
# leaving out the DataFrame index column.
output.to_csv('submission.csv', index=False)