In [None]:
import pandas as pd
import numpy as np
import spacy

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# Read the competition train/test CSVs in one pass (same order: train, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
    )
)

# spaCy pipeline used by preprocess_text for tokenization and lemmatization.
nlp = spacy.load('en_core_web_sm')

In [None]:
def preprocess_text(text):
    """
    Drop punctuation tokens from `text` and lemmatize the remaining ones.

    The text is run through the module-level spaCy pipeline `nlp`. Every
    token that is not punctuation contributes its lemma, each one preceded
    by a single space, so a non-empty result starts with a space and a
    text with no kept tokens yields ''.
    """
    lemmas = [tok.lemma_ for tok in nlp(text) if not tok.is_punct]
    return ''.join(' ' + lemma for lemma in lemmas)

In [None]:
# Add a cleaned, simplified representation of each training excerpt
# (punctuation removed, tokens lemmatized) as a new column.
train['clean_text'] = train['excerpt'].map(preprocess_text)

In [None]:
# Hold out a test split before running the cross-validation folds on the remainder.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    train.drop(columns=['target']), train.target, test_size=0.2, random_state=0
)

# Initialize the vectorizer.
# stop_words must be the *string* 'english' to select scikit-learn's built-in English
# stop-word list. The set {'english'} is a custom collection containing only the word
# "english", and newer scikit-learn releases reject it because stop_words is validated
# as a string, a list, or None.
vec = TfidfVectorizer(max_df=0.8, min_df=0.2, max_features=100000, stop_words='english')

# Learn the vocabulary on the train/validation split only, then apply that same fitted
# transform to the held-out split. .toarray() converts the sparse TF-IDF matrices into
# dense arrays.
X = vec.fit_transform(X_trainval["clean_text"]).toarray()
Xt = vec.transform(X_test["clean_text"]).toarray()

param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10]}

# Initialize the GridSearch with params to vary and specify the scoring metric to use.
# With scoring='neg_mean_squared_error' the reported scores are negated MSE values
# (closer to zero is better).
grid = GridSearchCV(Ridge(max_iter=1000), param_grid, scoring='neg_mean_squared_error', cv=5)
grid.fit(X, y_trainval)

print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Test score with best model params: {:.3f}".format(grid.score(Xt, y_test)))

# RMSE on the held-out split. Taking the square root explicitly avoids the `squared`
# keyword of mean_squared_error, which was deprecated and later removed in newer
# scikit-learn releases.
preds = grid.predict(Xt)
np.sqrt(mean_squared_error(y_test, preds))

In [None]:
# Apply the same cleaning (punctuation removal + lemmatization) to the test excerpts.
test['clean_text'] = test['excerpt'].map(preprocess_text)

In [None]:
# Vectorize the cleaned test text with the already-fitted vectorizer (transform only,
# no refit), densify it, and predict with the grid search's refit best estimator.
test_tfidf = vec.transform(test["clean_text"])
test_tr = test_tfidf.toarray()
preds = grid.predict(test_tr)

In [None]:
# Pair each test id with its predicted target, then write the submission file
# without the DataFrame index.
submission_columns = {'id': test['id'], 'target': preds}
sub = pd.DataFrame(submission_columns)
sub.to_csv("submission.csv", index=False)

In [None]:
# Show the submission frame as this cell's output.
sub