In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
import spacy
from tqdm import tqdm

## LOADING

In [None]:
# Directory that holds the competition CSV files; the loading cell below reads
# train.csv and test.csv relative to this path.
ROOT_DIR = "../input/commonlitreadabilityprize"

## PROCESSING

In [None]:
# Read the training and test tables from the competition directory.
train_csv = f"{ROOT_DIR}/train.csv"
test_csv = f"{ROOT_DIR}/test.csv"
tr = pd.read_csv(train_csv)
te = pd.read_csv(test_csv)

In [None]:
# Disabled alternative featurisation: TF-IDF over the excerpts instead of spaCy
# vectors. The code is wrapped in a bare string literal, so none of it runs.
# NOTE(review): the trailing "#26833" presumably records the vocabulary size of
# the uncapped vectoriser - confirm before relying on it.
"""
#vec = TfidfVectorizer() #26833
vec = TfidfVectorizer(max_features=10_000)
X = vec.fit_transform(tr["excerpt"]).toarray()
Xe = vec.transform(te["excerpt"]).toarray()
"""

In [None]:
# Load the large English spaCy model; its word vectors are the features.
nlp = spacy.load('en_core_web_lg')
# `disable_pipes()` called with no names disables nothing, so every pipeline
# component would still run on each excerpt. Pass all component names so that
# only the tokenizer runs. Per the spaCy docs, Doc.vector defaults to the
# average of the token vectors, so the features should be unchanged.
with nlp.disable_pipes(*nlp.pipe_names):
    # One document vector per training excerpt, stacked into a 2-D array.
    X = np.array([nlp(text).vector for text in tqdm(tr.excerpt)])

In [None]:
# `disable_pipes()` called with no names disables nothing; pass every component
# name so that only the tokenizer runs. Per the spaCy docs, Doc.vector defaults
# to the average of the token vectors, so the features should be unchanged.
with nlp.disable_pipes(*nlp.pipe_names):
    # One document vector per test excerpt, stacked into a 2-D array.
    Xe = np.array([nlp(text).vector for text in tqdm(te.excerpt)])

In [None]:
# Regression target as a plain NumPy array, aligned row-for-row with X.
y = tr["target"].to_numpy()

In [None]:
# Show the train and test feature-matrix shapes side by side.
np.shape(X), np.shape(Xe)

## TRAINING AND PREDICTION

In [None]:
# Number of cross-validation folds.
NFOLDS = 10
# Plain (non-stratified) K-fold splitter over the training rows.
# NOTE(review): shuffling is not enabled, so each validation fold is a
# contiguous range of rows - confirm the training file is not ordered.
skf = KFold(n_splits=NFOLDS)
# Materialise the (train indices, validation indices) pairs so that they can
# be looked up by fold number later.
folds = [*skf.split(X)]

In [None]:
# Out-of-fold predictions for every training row, plus the running average of
# the per-fold predictions on the test set.
oof = np.zeros(y.shape)
pred = np.zeros(len(Xe))
for fold_no, (fit_rows, held_out_rows) in enumerate(folds, start=1):
    print("FOLD: ", fold_no)
    # Ordinary least squares on the document vectors.
    reg = LinearRegression().fit(X[fit_rows], y[fit_rows])
    # Score only the rows this fold's model never saw.
    oof[held_out_rows] = reg.predict(X[held_out_rows])
    # Every fold contributes an equal 1/NFOLDS share to the test prediction.
    pred += reg.predict(Xe) / NFOLDS

In [None]:
# The score reported here is the *root* mean squared error of the out-of-fold
# predictions. The root is taken explicitly rather than through
# `squared=False`, which newer scikit-learn releases deprecate and remove, and
# the value is labelled RMSE instead of MSE.
rmse = np.sqrt(mean_squared_error(y, oof))
print("OOF RMSE:", rmse)

## SUBMISSION

In [None]:
# Submission frame: the test ids paired with the fold-averaged predictions.
sub = te[["id"]].assign(target=pred)

In [None]:
# Preview the first five submission rows.
sub.head(5)

In [None]:
# Write the submission file, leaving out the DataFrame index column.
sub.to_csv(path_or_buf="submission.csv", index=False)