In [None]:
import numpy as np
import pandas as pd
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Number of most-similar training excerpts averaged for each prediction.
SIMILARITY_COUNT = 15

# English stopword list from NLTK; tokens found in it are dropped by clean_text.
stopwords = nltk.corpus.stopwords.words('english')

# WordNet lemmatizer shared by every clean_text call.
wn = nltk.WordNetLemmatizer()

In [None]:
# Load the training data; the 'excerpt' and 'target' columns are used below.
train_df = pd.read_csv(
    filepath_or_buffer="../input/commonlitreadabilityprize/train.csv",
)
train_df.head(n=5)

In [None]:
# Load the test data; the 'id' and 'excerpt' columns are used below.
test_df = pd.read_csv(
    filepath_or_buffer="../input/commonlitreadabilityprize/test.csv",
)
test_df.head(n=5)

In [None]:
def tokenize_text(text):
    """Lower-case `text`, delete ASCII punctuation, and split it into tokens.

    Punctuation characters (``string.punctuation``) are removed outright, so
    "don't" becomes "dont". The remaining text is split on every run of
    characters outside ``a-z``; digits, whitespace and non-ASCII letters
    therefore act as separators.

    Returns:
        list[str]: the pieces produced by the split. Empty strings can appear
        at the start or end when the text begins or ends with a separator.
    """
    lowered = text.lower()
    # Deleting via a translation table is equivalent to filtering char by char.
    punctuation_free = lowered.translate(str.maketrans("", "", string.punctuation))
    return re.split('[^a-z]+', punctuation_free)

def clean_text(tokens):
    """Drop unwanted tokens and lemmatize the rest.

    A token is skipped when it appears in the module-level ``stopwords``
    list, consists only of whitespace, or is the empty string. Every
    surviving token is passed through ``wn.lemmatize``.

    Returns:
        list: the lemmatized tokens, in their original order.
    """
    lemmas = []
    for word in tokens:
        if word in stopwords:
            continue
        if str.isspace(word) or len(word) == 0:
            continue
        lemmas.append(wn.lemmatize(word))
    return lemmas

In [None]:
# Tokenize every training excerpt; the function is passed directly, no lambda needed.
train_df['tokens'] = train_df['excerpt'].apply(tokenize_text)
train_df[['tokens']]

In [None]:
# Tokenize every test excerpt with the same function used for the training set.
test_df['tokens'] = test_df['excerpt'].apply(tokenize_text)
test_df[['tokens']]

In [None]:
%%time

# The inputs are already token lists, so clean_text serves as the analyzer:
# it receives each list and returns the terms to count.
tfidf_vect = TfidfVectorizer(
    analyzer=clean_text,
    dtype=np.float32,
)
# The vocabulary and IDF weights are learned from the training excerpts only.
tfidf_vect.fit(train_df['tokens'])

In [None]:
# Build one 'tf_<term>' column name per vocabulary entry.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf_vect, "get_feature_names_out"):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

tfidf_vect_columns = ['tf_' + str(colname) for colname in feature_names]
# Show every 2000th name as a quick sanity check of the vocabulary.
tfidf_vect_columns[::2000]

In [None]:
# Dense TF-IDF matrix for the training excerpts, one column per vocabulary term.
train_tf_df = pd.DataFrame(
    data=tfidf_vect.transform(train_df['tokens']).toarray(),
    columns=tfidf_vect_columns,
)
train_tf_df.sample(n=5)

In [None]:
# Dense TF-IDF matrix for the test excerpts, using the vocabulary fitted on train.
test_tf_df = pd.DataFrame(
    data=tfidf_vect.transform(test_df['tokens']).toarray(),
    columns=tfidf_vect_columns,
)
test_tf_df.sample(n=5)

In [None]:
# Pairwise cosine similarity: rows index training excerpts, columns index test excerpts.
cs = cosine_similarity(X=train_tf_df, Y=test_tf_df)
cs.shape

In [None]:
# Create (or reset) the prediction column with a float placeholder.
test_df["target"] = 0.0
test_df.head(n=5)

In [None]:
# Predict each test excerpt's target as the mean target of its
# SIMILARITY_COUNT most similar training excerpts.
#
# Predictions are collected and written back as a whole column in one
# positional assignment, so the result does not depend on test_df having a
# default RangeIndex or on the 'target' column already existing.
train_targets = train_df['target']  # select the column once instead of slicing full rows per iteration

predicted_targets = []
for i in range(len(test_df)):
    # Column i of cs holds the similarity of every training excerpt to test excerpt i;
    # reversing the ascending argsort ranks training rows from most to least similar.
    similarities = np.argsort(cs[:, i])[::-1]
    predicted_targets.append(train_targets.iloc[similarities[:SIMILARITY_COUNT]].mean())

# The explicit float64 dtype keeps the column numeric even when test_df is empty.
test_df["target"] = np.array(predicted_targets, dtype=np.float64)

test_df.head()

In [None]:
# Keep only the two columns written to the submission file.
submission_df = test_df.loc[:, ["id", "target"]]
submission_df.head(n=5)

In [None]:
# Write the predictions without the DataFrame index column.
submission_df.to_csv(path_or_buf="submission.csv", index=False)