# Линейная регрессия на нескольких фичах

## Установка spaCy и загрузка модели без интернета на Каггле

In [None]:
%%time
!pip uninstall fastai en-core-web-sm en-core-web-lg spacy -y -q

In [None]:
%%time
!pip install ../input/spacy3/catalogue-2.0.3-py3-none-any.whl ../input/spacy3/typer-0.3.2-py3-none-any.whl ../input/spacy3/srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/pathy-0.5.2-py3-none-any.whl ../input/spacy3/smart_open-3.0.0-py3-none-any.whl ../input/spacy3/pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/thinc-8.0.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy_legacy-3.0.5-py2.py3-none-any.whl -q

In [None]:
%%time
!pip install ../input/spacy3/spacy_alignments-0.8.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy_transformers-1.0.2-py2.py3-none-any.whl ../input/spacy3/en_core_web_trf-3.0.0-py3-none-any.whl -q

In [None]:
%%time

from collections import Counter
import itertools
import pickle
from typing import List, Tuple

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import scipy
from scipy import stats

import sklearn
from sklearn import (
    dummy,
    ensemble,
    feature_selection,
    metrics,
    pipeline,
)

from tqdm.auto import tqdm
tqdm.pandas()

import spacy

import gensim
from gensim import downloader
print(f'Gensim version: {gensim.__version__}')

import torch
assert torch.cuda.is_available()

import tensorflow as tf
print(f'TF version: {tf.__version__}')
print(f'Eager mode: {tf.executing_eagerly()}')
print(f'GPU: {"is available" if tf.config.experimental.list_physical_devices("GPU") else "IS NOT AVAILABLE"}')

import transformers
print(f'Hugging Face version: {transformers.__version__}')

## Загружаю предобученные модели

In [None]:
%%time
def load_spacy_model():
    """Load and return the spaCy ``en_core_web_trf`` pipeline.

    GPU execution is not requested here: the ``spacy.require_gpu()`` call
    is left disabled.
    """
    # spacy.require_gpu()
    model_name = 'en_core_web_trf'  # roberta-base
    nlp = spacy.load(model_name)
    return nlp

spacynlp = load_spacy_model()

# Show the pipeline before and after trimming so the effect is visible in the
# cell output.
print(spacynlp.pipe_names)
# select_pipes(disable=...) is the spaCy 3 API; disable_pipes(...) is
# deprecated and only forwards to it.  The returned object is deliberately
# not used as a context manager, so the components stay disabled.
# The feature code in this notebook reads only sentence boundaries and the
# dependency parse (.sents, .root, .children, .dep_).
spacynlp.select_pipes(disable=[
    'ner',
    'attribute_ruler',
    'lemmatizer',
])
print(spacynlp.pipe_names)

In [None]:
%%time
def load_glove_counter_dict(filepath):
    """Map every word of a GloVe text file to its 1-based line number.

    Each line is expected to start with the word, followed by
    whitespace-separated vector components; only the word is used.
    The line number is used downstream as a frequency proxy (presumably the
    file is ordered from most to least frequent word - TODO confirm).

    Args:
        filepath: path to the GloVe embeddings text file.

    Returns:
        dict mapping word -> 1-based line number.  If a word appears on
        several lines, the last occurrence wins.
    """
    counter_dict = {}
    # The context manager guarantees the handle is closed; the explicit
    # encoding keeps the result independent of the platform locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(tqdm(f), start=1):
            # Only the leading token is needed, so stop splitting after it.
            fields = line.split(maxsplit=1)
            if not fields:
                # Blank line: nothing to index (it still counts as a line).
                continue
            counter_dict[fields[0]] = idx
    return counter_dict

glove_counter_dict = load_glove_counter_dict('../input/glove-embeddings/glove.6B.50d.txt')

In [None]:
%%time
def load_gpt_tokenizer_and_model(filepath):
    """Load a GPT-2 tokenizer and LM-head model from *filepath*.

    Args:
        filepath: directory (or identifier) accepted by ``from_pretrained``.

    Returns:
        ``(tokenizer, model)``; the model has been moved to the CUDA device.
    """
    device = torch.device('cuda')
    tokenizer = transformers.GPT2Tokenizer.from_pretrained(filepath)
    model = transformers.GPT2LMHeadModel.from_pretrained(filepath).to(device)
    return tokenizer, model

gpt_tokenizer, gpt_model = load_gpt_tokenizer_and_model('../input/huggingface-distilgpt2')

## Прочитаю данные

In [None]:
# Load the training data; later cells read its 'id', 'excerpt' and 'target'
# columns.
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
# Last expression of the cell: shows (n_rows, n_columns) in the cell output.
df.shape

## Добавлю вспомогательный столбец с бинами таргета

In [None]:
# Use numpy's 'doane' rule only to decide HOW MANY bins to use
# (other estimators: `sqrt`, `sturges`, `rice`, `scott`, `fd`, `auto`).
hist, bin_edges = np.histogram(df['target'], bins='doane')
num_bins = len(hist)
print('Number of bins:', num_bins)

# Cut the target into that many equal-width, labelled bins.
bin_labels = [f'target_bin_{x}' for x in range(num_bins)]
out, bins = pd.cut(df['target'], bins=num_bins, labels=bin_labels, retbins=True)

# Keep the labels as plain strings; they are used for stratified splitting.
df.loc[:, 'target_bin'] = out.astype(str)

# Fine-grained target histogram with the bin borders drawn on top.
plt.figure(figsize=(12, 5))
plt.hist(df['target'], bins=100)
for x_coord in bins:
    plt.axvline(x=x_coord, color='black')
plt.title('Target, binned with pd.cut')
plt.show()

## Добавлю вспомогательный столбец с фолдами

In [None]:
# -1 marks "no fold assigned yet".  The column is created with object dtype
# because string labels ('holdout', 'fold_0', ...) are written into it later;
# writing strings into an integer column triggers an incompatible-dtype
# warning in recent pandas.
df['fold'] = pd.Series(-1, index=df.index, dtype=object)

# Number of rows kept for cross-validation; the remainder becomes the holdout.
train_size = 2_500

# Only the second (non-train) part of the split is needed here.
_, holdout_ids = sklearn.model_selection.train_test_split(
    df['id'],
    train_size=train_size,
    random_state=567,
    shuffle=True,
    stratify=df['target_bin'],
)

holdout_ids = holdout_ids.values
print('Validation frac:', len(holdout_ids) / len(df))

df.loc[df['id'].isin(holdout_ids), 'fold'] = 'holdout'
# Every row that is not in the holdout must still carry the sentinel.
assert sum(df['fold'] == -1) == train_size

In [None]:
# Keep the original row labels in an 'index' column so the two parts can be
# put back in their original order after the fold assignment.
df = df.reset_index()

crossvalidation_df = df[df['fold'] != 'holdout'].reset_index(drop=True)
holdout_df = df[df['fold'] == 'holdout'].reset_index(drop=True)

n_splits = 5
skf = sklearn.model_selection.StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=567,
)

for idx, (_, test_index) in enumerate(
    skf.split(X=crossvalidation_df,
              y=crossvalidation_df['target_bin'])):
    # skf.split yields positions; they equal the labels here because the
    # frame was re-indexed with reset_index(drop=True) above.
    crossvalidation_df.loc[test_index, 'fold'] = f'fold_{idx}'

df = pd.concat([crossvalidation_df, holdout_df]).set_index('index').sort_index()

# `value in series` tests the index labels, not the values, so the sentinel
# has to be compared against the column values explicitly.
assert not (df['fold'] == -1).any()
# Each row label must belong to exactly one fold.
for each in df['fold'].unique():
    current, rest = df[df['fold'] == each], df[df['fold'] != each]
    assert set(current.index).isdisjoint(rest.index)

In [None]:
# Fold sizes.
print(Counter(df['fold']))

# One small target histogram per fold, to eyeball that the folds are
# similarly distributed.
for each in sorted(df['fold'].unique()):
    fold_targets = df.loc[df['fold'] == each, 'target']
    plt.figure(figsize=(12, 1))
    plt.hist(fold_targets, bins=20)
    plt.title(f'target, {each}')
    plt.show()

## Получу документы spaCy (это небыстро)

In [None]:
%%time
def get_spacynlp_docs(df, spacynlp):
    """Run the spaCy pipeline over every excerpt of *df*.

    Args:
        df: DataFrame with an 'excerpt' text column.
        spacynlp: callable pipeline that turns a text into a parsed document.

    Returns:
        List of parsed documents, in the row order of *df*.
    """
    # Iterate the text column directly: iterrows() would build a full Series
    # per row only to read a single field from it.
    return [spacynlp(text) for text in tqdm(df['excerpt'], total=len(df))]

spacynlp_docs = get_spacynlp_docs(df, spacynlp)

In [None]:
# with open('spacynlp_docs.pkl', 'wb') as f:
#     pickle.dump(spacynlp_docs, f)

# with open('spacynlp_docs.pkl', 'rb') as f:
#     spacynlp_docs = pickle.load(f)

## Добавлю фичи, частотности GloVe

In [None]:
%%time
def text_to_avg_word2vec_count(text, glove_counter_dict):
    """Return the mean ``glove_counter_dict`` value over the words of *text*.

    The text is tokenised with ``gensim.utils.simple_preprocess``; tokens
    missing from the dictionary are ignored.

    Args:
        text: raw text.
        glove_counter_dict: mapping word -> numeric score.

    Returns:
        Mean score of the tokens found in the dictionary.

    Raises:
        ValueError: if no token of *text* is present in the dictionary.
    """
    tokens = gensim.utils.simple_preprocess(text)
    scores = [glove_counter_dict[word] for word in tokens if word in glove_counter_dict]
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not scores:
        raise ValueError('no token of the text is present in glove_counter_dict')
    return np.mean(scores)

df['excerpt_avg_glove_count'] = df['excerpt'].progress_apply(lambda x: text_to_avg_word2vec_count(x, glove_counter_dict))

## Добавлю фичу, средняя глубина синтаксического дерева

In [None]:
%%time
def get_avg_tree_depth(spacynlp_doc):
    """Return the mean dependency-tree height over the sentences of a doc.

    The height of a sentence is the number of tokens on the longest path from
    its root down to a leaf (a root without children has height 1).

    Adapted from
    https://gist.github.com/drussellmrichie/47deb429350e2e99ffb3272ab6ab216a
    """

    def tree_height(root):
        # Walk the tree with an explicit stack, remembering the deepest level.
        deepest = 0
        pending = [(root, 1)]
        while pending:
            node, level = pending.pop()
            deepest = max(deepest, level)
            pending.extend((child, level + 1) for child in node.children)
        return deepest

    heights = [tree_height(sent.root) for sent in spacynlp_doc.sents]
    return np.mean(heights)

# One average tree depth per parsed document, in the row order of df.
depths = [get_avg_tree_depth(doc) for doc in tqdm(spacynlp_docs)]

df['excerpt_avg_max_tree_depth'] = depths

## Добавлю фичи, расстояние до main-verb

In [None]:
%%time
def get_avg_root_distance_from_sentence_begin(spacynlp_doc):
    """Return the mean 1-based position of the ROOT token, counted from the
    start of each sentence.

    Args:
        spacynlp_doc: parsed document exposing ``.sents``; every token of a
            sentence exposes ``.dep_``.

    Returns:
        Mean position over the sentences that contain a ROOT token.

    Raises:
        ValueError: if no sentence contains a ROOT token.
    """
    distances = []
    # Read the sentences from the argument, not from a module-level variable.
    for sent in spacynlp_doc.sents:
        for position, token in enumerate(sent, start=1):
            if token.dep_ == 'ROOT':
                distances.append(position)
                break
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not distances:
        raise ValueError('document has no sentence with a ROOT token')
    return np.mean(distances)

def get_avg_root_distance_from_sentence_end(spacynlp_doc):
    """Return the mean 1-based position of the ROOT token, counted backwards
    from the end of each sentence.

    Args:
        spacynlp_doc: parsed document exposing ``.sents``; every token of a
            sentence exposes ``.dep_``.

    Returns:
        Mean position over the sentences that contain a ROOT token.

    Raises:
        ValueError: if no sentence contains a ROOT token.
    """
    distances = []
    # Read the sentences from the argument, not from a module-level variable.
    for sent in spacynlp_doc.sents:
        for position, token in enumerate(reversed(list(sent)), start=1):
            if token.dep_ == 'ROOT':
                distances.append(position)
                break
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not distances:
        raise ValueError('document has no sentence with a ROOT token')
    return np.mean(distances)

# Average ROOT position per document, counted from the sentence start.
# NOTE(review): keep the loop variable named `doc` - if the helper functions
# read a module-level `doc` rather than their own parameter, renaming it here
# would silently change what they compute.
root_distances_from_sentence_begin = []
for doc in tqdm(spacynlp_docs):
    root_distances_from_sentence_begin.append(get_avg_root_distance_from_sentence_begin(doc))

# Same, counted from the sentence end.
root_distances_from_sentence_end = []
for doc in tqdm(spacynlp_docs):
    root_distances_from_sentence_end.append(get_avg_root_distance_from_sentence_end(doc))
    
# The lists are in the order of spacynlp_docs, which was built row by row from df.
df['excerpt_avg_root_distances_from_sentence_begin'] = root_distances_from_sentence_begin
df['excerpt_avg_root_distances_from_sentence_end'] = root_distances_from_sentence_end

## Добавлю фичу: перплексия (насколько вероятно сгенерировать такой текст)

In [None]:
%%time
def perplexity_sentence(sentence, gpt_tokenizer, gpt_model):
    """Score *sentence* with a language model.

    Despite the name, the value returned is the model loss multiplied by
    ``number_of_tokens - 1``; it is not exponentiated.  With a loss that is
    the mean negative log-likelihood per predicted token (presumably the case
    for the GPT-2 LM head - confirm against the library docs), this is the
    total negative log-likelihood of the text.

    Args:
        sentence: text to score; it is wrapped in newlines before encoding.
        gpt_tokenizer: tokenizer whose ``encode`` returns a ``(1, n)`` tensor.
        gpt_model: model called as ``model(ids, labels=ids)`` whose first
            output is the loss; it must expose a ``device`` attribute.

    Returns:
        numpy scalar: loss * (n - 1).
    """
    tokens_tensor = gpt_tokenizer.encode('\n' + sentence + '\n', add_special_tokens=False, return_tensors='pt')
    # Follow the model's own device instead of assuming CUDA.
    tokens_tensor = tokens_tensor.to(gpt_model.device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        loss = gpt_model(tokens_tensor, labels=tokens_tensor)[0]
    n_tokens = tokens_tensor.shape[1] - 1
    return loss.cpu().detach().numpy() * n_tokens

# Sanity check: the three probe words must already come out in ascending
# score order.
x = [
    perplexity_sentence(word, gpt_tokenizer, gpt_model)
    for word in ['cat', 'student', 'sigmoid']
]
assert x == sorted(x)

# Score every excerpt.
df['excerpt_perplexity'] = df['excerpt'].progress_apply(
    lambda text: perplexity_sentence(text, gpt_tokenizer, gpt_model)
)

## Линейная регрессия

In [None]:
# Build explicit (train, test) splits from the 'fold' column; holdout rows
# are used in neither part.
cv_iterator = []
is_holdout = (df['fold'] == 'holdout').values
for each in df['fold'].unique():
    if each == 'holdout':
        continue
    is_test = (df['fold'] == each).values
    # scikit-learn treats these arrays as row POSITIONS, so derive them from
    # boolean masks instead of relying on the index labels being 0..n-1.
    train_indices = np.flatnonzero(~is_test & ~is_holdout)
    test_indices = np.flatnonzero(is_test)
    cv_iterator.append((train_indices, test_indices))

In [None]:
# Candidate feature columns; each one is added to df by an earlier cell.
wanted_cols = [
    'excerpt_avg_glove_count',
    'excerpt_avg_max_tree_depth',
    'excerpt_avg_root_distances_from_sentence_begin',
    'excerpt_avg_root_distances_from_sentence_end',
    'excerpt_perplexity',
]

In [None]:
# Baseline: always predict the mean target of the training part.
dummy_reg = sklearn.dummy.DummyRegressor(strategy='mean')
scores = sklearn.model_selection.cross_val_score(
    estimator=dummy_reg,
    X=df[wanted_cols],
    y=df['target'],
    scoring='neg_root_mean_squared_error',
    cv=cv_iterator,
)
# The scorer returns negated RMSE; flip the sign for display.
print('Dummy, CV scores:', -scores)
print('Dummy, avg CV score:', -np.mean(scores))

In [None]:
# Named `linreg_pipeline` so that the `pipeline` module imported from sklearn
# at the top of the notebook is not overwritten by an estimator object.
linreg_pipeline = sklearn.pipeline.make_pipeline(
    sklearn.preprocessing.StandardScaler(),
    sklearn.linear_model.LinearRegression(),
)
scores = sklearn.model_selection.cross_val_score(
    linreg_pipeline,
    df[wanted_cols],
    df['target'],
    scoring='neg_root_mean_squared_error',
    cv=cv_iterator,
)
# The scorer returns negated RMSE; flip the sign for display.
print('Linreg, CV scores:', scores * (-1))
print('Linreg, avg CV score:', np.mean(scores) * (-1))

Отберу фичи:

In [None]:
# NOTE(review): the scaler and the selector are fitted on every row of df,
# holdout rows included; cv_iterator only restricts the CV folds.  Confirm
# whether the holdout rows should be excluded from this step.
scaler = sklearn.preprocessing.StandardScaler().fit(df[wanted_cols])
scaled_features = scaler.transform(df[wanted_cols])

# Recursive feature elimination, one feature per step, scored on the folds.
linreg = sklearn.linear_model.LinearRegression()
selector = sklearn.feature_selection.RFECV(estimator=linreg, step=1, cv=cv_iterator)
selector.fit(scaled_features, df['target'])

# support_ holds one boolean per column of wanted_cols, in the same order.
selected_cols = list(itertools.compress(wanted_cols, selector.support_))
print(f'Selected cols ({len(selected_cols)} of {len(wanted_cols)}): {selected_cols}')
print(f'Dropped cols ({len(wanted_cols) - len(selected_cols)} of {len(wanted_cols)}): {[x for x in wanted_cols if x not in selected_cols]}')

С отобранными фичами скор на холдауте:

In [None]:
# Split once instead of re-evaluating the same mask for every use.
holdout_mask = df['fold'].isin(['holdout'])
train_part = df[~holdout_mask]
holdout_part = df[holdout_mask]

# Fit the scaler and the model on the non-holdout rows only.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(train_part[selected_cols])
linreg = sklearn.linear_model.LinearRegression()
linreg.fit(
    scaler.transform(train_part[selected_cols]),
    train_part['target'],
)
y_true = holdout_part['target']
y_pred = linreg.predict(scaler.transform(holdout_part[selected_cols]))
# RMSE as sqrt(MSE): the `squared=False` argument is deprecated and later
# removed in scikit-learn, while this form works on every version.
holdout_score = np.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred))
print('Holdout score:', holdout_score)

In [None]:
# Predictions first, true targets second, both against the row position
# within the holdout set.
plt.figure(figsize=(12, 5))
for series in (y_pred, y_true):
    plt.plot(range(len(series)), series)
plt.show()

In [None]:
# Persist the fitted scaler and regressor, scaler first.
for artifact_path, artifact in (
    ('/kaggle/working/scaler.pkl', scaler),
    ('/kaggle/working/linreg.pkl', linreg),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

## Сабмит

In [None]:
# Reminder of which feature columns the fitted scaler and model expect.
print(selected_cols)

In [None]:
# Load the test data; later cells read its 'id' and 'excerpt' columns.
submit_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
# Last expression of the cell: shows (n_rows, n_columns) in the cell output.
submit_df.shape

In [None]:
# Recompute every candidate feature for the test set.  All columns of the
# training feature list are produced - including the root distance from the
# sentence start - so that selecting `selected_cols` from submit_df cannot
# fail with a KeyError, whichever columns the feature selection kept.

# NOTE: this rebinds `spacynlp_docs`, replacing the training documents.
spacynlp_docs = get_spacynlp_docs(submit_df, spacynlp)

###

submit_df['excerpt_avg_glove_count'] = submit_df['excerpt'].progress_apply(lambda x: text_to_avg_word2vec_count(x, glove_counter_dict))

###

depths = []
for doc in tqdm(spacynlp_docs):
    depths.append(get_avg_tree_depth(doc))

submit_df['excerpt_avg_max_tree_depth'] = depths

###

# NOTE(review): keep the loop variable named `doc` in the loops below - if the
# root-distance helpers read a module-level `doc` rather than their own
# parameter, renaming it would silently change what they compute.
root_distances_from_sentence_begin = []
for doc in tqdm(spacynlp_docs):
    root_distances_from_sentence_begin.append(get_avg_root_distance_from_sentence_begin(doc))

submit_df['excerpt_avg_root_distances_from_sentence_begin'] = root_distances_from_sentence_begin

###

root_distances_from_sentence_end = []
for doc in tqdm(spacynlp_docs):
    root_distances_from_sentence_end.append(get_avg_root_distance_from_sentence_end(doc))

submit_df['excerpt_avg_root_distances_from_sentence_end'] = root_distances_from_sentence_end

###

submit_df['excerpt_perplexity'] = submit_df['excerpt'].progress_apply(
    lambda x: perplexity_sentence(x,  gpt_tokenizer, gpt_model)
)

In [None]:
# Scale the selected features with the fitted scaler, then predict.
result = linreg.predict(scaler.transform(submit_df[selected_cols]))

# Two-column submission: the test ids and the predicted targets.
submission_df = pd.DataFrame({'id': submit_df.id, 'target': result})

submission_df.to_csv('submission.csv', index=False)

In [None]:
# Last expression of the cell: renders the submission table in the output.
submission_df