In [None]:
import re
import pandas as pd
import numpy as np

from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor

In [None]:
# Matches one non-word character, digit, underscore or plus sign at a time.
# The capturing group makes re.split() return the matched separators as well.
GROUPING_SPACE_REGEX = re.compile(r'([^\w]|[\d]|[_]|[+])', re.UNICODE)

def simple_word_tokenize(text, _split=GROUPING_SPACE_REGEX.split):
    """Lower-case ``text`` and split it into tokens.

    Letter runs become tokens; every separator matched by
    ``GROUPING_SPACE_REGEX`` (punctuation, digits, ``_``, ``+``) is kept as a
    one-character token of its own. Empty strings and whitespace-only pieces
    are discarded.

    Args:
        text: String to tokenize.
        _split: Split callable, bound once at definition time to avoid a
            global lookup per call.

    Returns:
        List of lower-cased tokens in their original order.
    """
    tokens = []
    for piece in _split(text.lower()):
        if not piece or piece.isspace():
            continue
        tokens.append(piece)
    return tokens

def token_r(text):
    """Tokenize ``text`` and lemmatize every token of at least two characters.

    Tokens come from ``simple_word_tokenize``; one-character tokens (which
    includes the single separators that tokenizer keeps) are skipped. Each
    remaining token is passed through ``Word(...).lemmatize()`` with its
    default arguments.

    Returns:
        List of lemmas, in token order.
    """
    lemmas = []
    for token in simple_word_tokenize(text):
        if len(token) < 2:
            continue
        lemmas.append(Word(token).lemmatize())
    return lemmas

In [None]:
# Read train.csv and test.csv from the Kaggle input directory into DataFrames.
# Later cells add feature columns to both frames in place.
train_data = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')

In [None]:
# Frame indexed by the test ids with a placeholder `target` column of zeros.
submission = test_data[['id']].assign(target=0).set_index('id', drop=True)

In [None]:
def fill(data, license_names=None):
    """Clean the text columns of ``data`` in place and add license indicators.

    Steps, all written back into ``data``:

    * ``license``: missing values become ``'No License'`` and ``'-'`` is
      replaced by a space.
    * ``url_legal``: missing values become ``'nourl'`` and every
      ``'https://'`` is removed.
    * ``excerpt``: each newline is replaced by a single space.
    * For every license name other than ``'No License'`` a 0/1 column named
      ``'<license name>: contained'`` is added.

    Args:
        data: DataFrame with ``license``, ``url_legal`` and ``excerpt``
            columns. Modified in place.
        license_names: Optional iterable of (already cleaned) license names
            to build indicator columns for. When omitted, the names are taken
            from the global ``train_data`` so that every frame receives the
            same set of indicator columns.

    Returns:
        None.
    """
    # Fill before touching `.str`: a column that is entirely missing is read
    # as float64, and the `.str` accessor raises AttributeError on it.
    # 'No License' contains no '-', so the order does not change the result.
    data['license'] = (
        data['license'].fillna('No License').str.replace('-', ' ', regex=False)
    )
    data['url_legal'] = (
        data['url_legal'].fillna('nourl').str.replace('https://', '', regex=False)
    )
    # Replace newlines with a space rather than deleting them; deleting glues
    # the last word of one line to the first word of the next.
    data['excerpt'] = data['excerpt'].str.replace('\n', ' ', regex=False)

    if license_names is None:
        # Apply the same cleaning to the train licenses so the column set does
        # not depend on whether train_data has been passed through fill() yet.
        # On an already-cleaned column this is a no-op.
        license_names = (
            train_data['license']
            .fillna('No License')
            .str.replace('-', ' ', regex=False)
            .unique()
        )

    for license_name in license_names:
        if license_name != 'No License':
            data[f'{license_name}: contained'] = (
                data['license'] == license_name
            ).astype(int)

In [None]:
# fill() builds the license indicator columns from train_data.license, so
# train_data is cleaned before test_data. Both frames are modified in place.
fill(train_data)
fill(test_data)

In [None]:
# Separator for word counting: one or more consecutive non-word characters.
TOKEN_REGEX = re.compile(r'\W+', re.UNICODE)
def sw_tokenize(text, _split=TOKEN_REGEX.split):
    """Lower-case ``text`` and split it on runs of non-word characters.

    Args:
        text: String to tokenize.
        _split: Split callable, bound once at definition time.

    Returns:
        List of non-empty, non-whitespace tokens in their original order.
    """
    pieces = _split(text.lower())
    return list(filter(lambda piece: piece and not piece.isspace(), pieces))

def tokenize(text):
    """Split ``text`` with ``sw_tokenize`` and lemmatize every token.

    Each token is passed through ``Word(...).lemmatize()`` with its default
    arguments, so the result has exactly one entry per token.

    Returns:
        List of lemmas, in token order.
    """
    return list(map(lambda token: Word(token).lemmatize(), sw_tokenize(text)))

def count_words(text):
    """Return the number of word tokens in ``text``.

    ``tokenize`` produces exactly one lemma per token from ``sw_tokenize``, so
    the count is identical either way. Counting the raw tokens avoids
    lemmatizing every word only to throw the lemmas away.

    Args:
        text: String to count words in.

    Returns:
        Number of tokens as an ``int``.
    """
    return len(sw_tokenize(text))

def count_symbols(text, symbol='[,]'):
    """Count the non-overlapping matches of the regex ``symbol`` in ``text``.

    Args:
        text: String to search.
        symbol: Regular-expression pattern; the default matches a comma.

    Returns:
        Number of matches as an ``int``.
    """
    return sum(1 for _ in re.finditer(symbol, text))

def mean_sentence_len(text):
    """Return the average number of words per sentence, rounded to 4 decimals.

    Sentences are approximated by counting periods, after each ``'...'`` has
    been collapsed to a single ``'.'``. The word count comes from
    ``count_words``.

    Args:
        text: String to analyse.

    Returns:
        Words divided by sentence count. A text without any period is treated
        as one sentence, so the result is then the word count itself.
    """
    word_total = count_words(text)
    sentence_total = count_symbols(text.replace('...', '.'), '[.]')
    # A text with no '.' (for example one ending in '!' or '?') would divide
    # by zero; count it as a single sentence instead.
    return round(word_total / max(sentence_total, 1), 4)

def mean_word_len(text):
    """Return the average length of the lemmas of ``text``, rounded to 4 decimals.

    Lengths are measured on the output of ``tokenize`` (lemmatized tokens),
    not on the surface words.
    """
    lemmas = tokenize(text)
    total_chars = np.array(list(map(len, lemmas))).sum()
    return round(total_chars / len(lemmas), 4)

In [None]:
def process_stats(data):
    """Add four per-excerpt statistic columns to ``data`` in place.

    Columns added, each computed by applying a helper to ``data['excerpt']``:
    ``len_text`` (``count_words``), ``comma_amount`` (``count_symbols`` with
    its default pattern), ``mean_sentence_len`` and ``mean_word_len``.

    Returns:
        None.
    """
    feature_builders = (
        ('len_text', count_words),
        ('comma_amount', count_symbols),
        ('mean_sentence_len', mean_sentence_len),
        ('mean_word_len', mean_word_len),
    )
    for column, builder in feature_builders:
        data[column] = data['excerpt'].apply(builder)

In [None]:
# Add the text-statistic columns to both frames in place.
process_stats(train_data)
process_stats(test_data)

In [None]:
# Token counts over the cleaned URL strings, tokenized with token_r.
cv = CountVectorizer(tokenizer=token_r)

# The vocabulary is fitted on the train URLs only; the test URLs are
# transformed with that same vocabulary. toarray() turns each result into a
# dense array.
matrix_urls = cv.fit_transform(train_data.url_legal).toarray()
matrix_test = cv.transform(test_data.url_legal).toarray()

In [None]:
# TF-IDF over single tokens (ngram_range=(1, 1)) of the excerpts, tokenized
# with token_r.
tfidf = TfidfVectorizer(tokenizer=token_r, ngram_range=(1, 1))

# Fitted on the train excerpts only; the test excerpts reuse that vocabulary.
# toarray() turns each result into a dense array.
matrix_texts = tfidf.fit_transform(train_data.excerpt).toarray()
m_texts_test = tfidf.transform(test_data.excerpt).toarray()

In [None]:
def _feature_names(vectorizer):
    """Return the vocabulary terms of a fitted vectorizer as a list.

    Prefers ``get_feature_names_out()`` and falls back to the older
    ``get_feature_names()`` when the former is not available.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


def concat_df(matrix_url, matrix_text, url_vectorizer=None, text_vectorizer=None):
    """Combine the URL and excerpt feature matrices into one labelled DataFrame.

    Args:
        matrix_url: 2-D array with one column per term of the URL vectorizer.
        matrix_text: 2-D array with one column per term of the text vectorizer.
        url_vectorizer: Fitted vectorizer that produced ``matrix_url``.
            Defaults to the global ``cv``.
        text_vectorizer: Fitted vectorizer that produced ``matrix_text``.
            Defaults to the global ``tfidf``.

    Returns:
        DataFrame whose columns are ``'<term>:url'`` followed by
        ``'<term>:text'``. The ``'en:url'`` and ``'nourl:url'`` columns are
        removed when present.
    """
    if url_vectorizer is None:
        url_vectorizer = cv
    if text_vectorizer is None:
        text_vectorizer = tfidf

    url_columns = [f'{term}:url' for term in _feature_names(url_vectorizer)]
    text_columns = [f'{term}:text' for term in _feature_names(text_vectorizer)]

    # errors='ignore': a vocabulary that happens to lack 'en' or 'nourl' has
    # nothing to remove, which should not abort the run with a KeyError.
    df_urls = pd.DataFrame(data=matrix_url, columns=url_columns).drop(
        columns=['en:url', 'nourl:url'], errors='ignore'
    )
    df_texts = pd.DataFrame(data=matrix_text, columns=text_columns)

    return pd.concat([df_urls, df_texts], axis=1)

In [None]:
# Labelled URL + excerpt feature frames for train and test; both are built
# from the same fitted vectorizers (cv, tfidf).
df_merged = concat_df(matrix_urls, matrix_texts)
df_merged_test = concat_df(matrix_test, m_texts_test)

In [None]:
def targeting(data):
    """Add ``target_min`` and ``target_max`` columns to ``data`` in place.

    ``target_min`` is ``target - standard_error`` and ``target_max`` is
    ``target + standard_error``, computed row by row.

    Returns:
        None.
    """
    center = data['target']
    spread = data['standard_error']
    data['target_min'] = center.sub(spread)
    data['target_max'] = center.add(spread)

In [None]:
# Only the train frame is passed; targeting() reads its target and
# standard_error columns.
targeting(train_data)

In [None]:
# Raw columns that are not used as features once the derived columns exist.
to_drop = ['url_legal', 'license', 'excerpt', 'standard_error']

# Side-by-side concatenation of the per-row columns with the vectorizer
# features; rows are aligned on the index of both frames.
data_train = pd.concat([train_data.drop(columns=to_drop), df_merged], axis=1)
data_test = pd.concat(
    [test_data.drop(columns=['url_legal', 'license', 'excerpt']), df_merged_test],
    axis=1,
)

In [None]:
# Move `id` from a column to the index so it is not used as a feature.
# In-place: running this cell a second time fails because the `id` column
# no longer exists.
data_train.set_index('id', drop=True, inplace=True)
data_test.set_index('id', drop=True, inplace=True)

In [None]:
# Features are everything except the three target columns; the test frame is
# used as-is.
X_train = data_train.drop(columns=['target', 'target_min', 'target_max'])
X_test = data_test
y_train = data_train.target

In [None]:
# Regression targets for the two bound models, as plain arrays.
y_train_min = data_train['target_min'].values
y_train_max = data_train['target_max'].values

In [None]:
# Lower-bound model: trained on target_min (target - standard_error).
lgbm_min = LGBMRegressor(random_state=42)

# .values passes plain arrays, so train and test features are matched by
# column position rather than by column name.
lgbm_min.fit(X_train.values, y_train_min)

y_pred_min = lgbm_min.predict(X_test.values)

In [None]:
# Upper-bound model: trained on target_max (target + standard_error).
lgbm_max = LGBMRegressor(random_state=42)

# .values passes plain arrays, so train and test features are matched by
# column position rather than by column name.
lgbm_max.fit(X_train.values, y_train_max)

y_pred_max = lgbm_max.predict(X_test.values)

In [None]:
# Final prediction: midpoint of the upper-bound and lower-bound predictions.
y_pred_mean = (y_pred_max + y_pred_min)/2

In [None]:
# Re-read the test ids, attach the averaged predictions (same row order as
# test.csv) and write the two-column submission file.
sample = (
    pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")[['id']]
    .assign(target=y_pred_mean)
)
sample.to_csv('submission.csv', index=False)