In [None]:
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error as mse
import seaborn as sns

In [None]:
# Relative locations of the train and test CSV files read by the cells below.
TRAIN_PATH = '../input/commonlitreadabilityprize/train.csv'
TEST_PATH = '../input/commonlitreadabilityprize/test.csv'
# Uncomment the option below to display full cell contents instead of
# truncated text when showing DataFrames.
# pd.set_option('display.max_colwidth', None)

In [None]:
# Load the training set and preview its first rows.
data = pd.read_csv(TRAIN_PATH)
data.head()

In [None]:
# Summary statistics of the training frame.
data.describe()

In [None]:
# Order rows by ascending `target`; the positional slices used for the word
# clouds further down rely on this ordering.
data = data.sort_values(by=['target'])

In [None]:
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    ``str.split()`` with no argument splits on any run of whitespace, so
    newlines and tabs separate words, repeated spaces do not produce empty
    "words", and an empty string counts as zero words.
    """
    return len(text.split())
def long_words(text, length):
    """Return how many words in ``text`` have at least ``length`` characters.

    Words are separated by any run of whitespace, so two short words joined
    by a newline are not mistaken for a single long word.
    """
    return sum(1 for word in text.split() if len(word) >= length)

# Surface-level features of each excerpt: number of words, number of
# characters, and counts of words at or above three length thresholds.
excerpts = data['excerpt']
data['count'] = excerpts.apply(word_count)
data['len'] = excerpts.apply(len)
for column, min_length in (('word7', 7), ('word10', 10), ('word13', 13)):
    # Extra keyword arguments to Series.apply are forwarded to the function.
    data[column] = excerpts.apply(long_words, length=min_length)

In [None]:
# Distribution of the label and of its reported standard error.
sns.displot(data=data, x='target')
sns.displot(data=data, x='standard_error')

In [None]:
# Inspect the rows whose standard_error is below 0.4.
data[data['standard_error'] < 0.4]

In [None]:
# Rows with standard_error below 0.4 are suspected to be errors, so drop them.
# `>=` keeps exactly the complement of the `< 0.4` rows inspected above, and
# `.copy()` makes the result an independent frame so that adding columns to
# it later does not raise SettingWithCopyWarning.
data = data[data['standard_error'] >= 0.4].copy()
sns.displot(data, x='standard_error')

In [None]:
# Inspect the rows whose standard_error is above 0.63.
data[data['standard_error'] > 0.63]

In [None]:
# Scatter plot with a fitted regression line of `target` against each
# surface-level feature.
for feature in ('count', 'len', 'word7', 'word10', 'word13'):
    sns.lmplot(x=feature, y='target', data=data)
# Relationship between the label and its standard error.
sns.lmplot(x='target', y='standard_error', data=data)

In [None]:
# English stopword list held as a set for fast membership tests in tokenize().
stopwords_en = set(stopwords.words('english'))
# WordNet lemmatizer shared by every tokenize() call.
lemma = nltk.WordNetLemmatizer()

def tokenize(text):
    """Return the lemmatized, stopword-free word tokens of ``text``.

    The text is lower-cased and every character outside ``a-z`` is replaced
    by a space before tokenization, so digits and punctuation never reach
    the tokenizer.
    """
    letters_only = re.sub(r'[^a-z]', ' ', text.lower())
    kept = []
    for token in nltk.word_tokenize(letters_only):
        if token in stopwords_en:
            continue
        kept.append(lemma.lemmatize(token))
    return kept

def normalize(text):
    """Return the tokens of ``text`` joined into one space-separated string."""
    tokens = tokenize(text)
    return ' '.join(tokens)

# Tokenize each excerpt once, then build the normalized string from the
# stored tokens. This yields the same text as normalize(excerpt) without
# running the tokenizer and lemmatizer a second time over every row.
data['tokens'] = data['excerpt'].apply(tokenize)
data['normalized'] = data['tokens'].apply(' '.join)

In [None]:
# Preview the frame with the newly added columns.
data.head()

In [None]:
def show_word_cloud(corpus, title=None):
    """Draw a word cloud of the ``normalized`` text in ``corpus``.

    Args:
        corpus: DataFrame (or slice of one) with a ``normalized`` column of
            space-separated tokens.
        title: Optional caption drawn above the cloud.

    The cloud is drawn on a new figure; nothing in this function calls
    ``plt.show()``, so displaying the figure is left to the caller.
    """
    wc = WordCloud(stopwords=STOPWORDS, width=1000, height=600, max_words=150)
    wc.generate(' '.join(corpus['normalized']))
    # plt.figure() without a number always opens a fresh figure. Passing an
    # explicit number (as a hand-maintained global counter did) would instead
    # reuse any figure that is already open under that number.
    plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    # Pixel coordinates carry no meaning for a word cloud image.
    plt.axis('off')
    if title is not None:
        plt.title(title)

# `data` is ordered by target, so consecutive positional chunks correspond to
# increasing target bands. Stepping over the real length covers every row
# and never passes an empty slice, whatever the size of the frame.
CLOUD_CHUNK_SIZE = 500
for start in range(0, len(data), CLOUD_CHUNK_SIZE):
    show_word_cloud(data.iloc[start:start + CLOUD_CHUNK_SIZE])

plt.show()

In [None]:
# Load the test set; it is passed to train() as the frame to predict on.
test = pd.read_csv(TEST_PATH)

def train(model, model_name, X_train, y_train, X_test, y_test, eval_df):
    """Fit a TF-IDF + regressor pipeline, report hold-out MSE, write a submission.

    Args:
        model: Unfitted regressor placed after the TF-IDF vectorizer.
        model_name: Label used in the printed report and in the output
            file name.
        X_train, y_train: Training texts and targets.
        X_test, y_test: Hold-out texts and targets used to compute the MSE.
        eval_df: DataFrame with ``id`` and ``excerpt`` columns to predict on.

    Side effects:
        Prints the MSE, the fit time and the submission frame, and writes
        ``/kaggle/working/{model_name}_submission.csv``. The
        ``RidgeRegression`` run is additionally saved as
        ``/kaggle/working/submission.csv``.
    """
    pipeline = make_pipeline(
        TfidfVectorizer(binary=True, ngram_range=(1, 1)),
        model,
    )

    # Time only the fit so "Training time" is not inflated by prediction
    # and scoring.
    t1 = time.time()
    pipeline.fit(X_train, y_train)
    t2 = time.time()
    training_time = t2 - t1

    y_pred = pipeline.predict(X_test)
    MSE = mse(y_test, y_pred)

    print('--- Model: ', model_name, '---')
    print('MSE: ', MSE, '\t\t\t', 'Training time: ', training_time, '\n')

    # Copy the id column so adding `target` modifies an independent frame
    # rather than a slice of eval_df (avoids SettingWithCopyWarning).
    submission = eval_df[['id']].copy()
    submission['target'] = pipeline.predict(eval_df['excerpt'])
    print(submission)
    # index=False keeps the files to the two columns `id,target`; otherwise
    # the DataFrame index is written as an extra unnamed first column.
    submission.to_csv(f'/kaggle/working/{model_name}_submission.csv', index=False)
    if model_name == 'RidgeRegression':
        submission.to_csv('/kaggle/working/submission.csv', index=False)
    
# The `normalize` argument was removed from Ridge in scikit-learn 1.2, and
# passing it there raises TypeError. Its old default was False, so omitting
# it keeps the same behaviour on every version.
ridge = Ridge(fit_intercept=True)
lr = LinearRegression()
# Display name -> estimator; train() uses the name in its report and file names.
models = {
    'RidgeRegression': ridge,
    'LinearRegression': lr,
}
# Raw excerpts are the model input and the `target` column is the label.
X = data['excerpt']
y = data['target']

# Hold out 33% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42,
)

# Fit, evaluate and write a submission for every configured model.
for name, estimator in models.items():
    train(estimator, name, X_train, y_train, X_test, y_test, test)