Notebook References:
1. Inspiration to use Spacy: https://www.kaggle.com/vigneshbaskaran/commonlit-spacy-with-ridge-regression
2. Inspiration to use UMAP: https://www.kaggle.com/subinium/commonlit-how-to-visualize-text-dataset <br>
**Please upvote if you find this useful, it helps in keeping the motivation levels high**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Create Vectors

In [None]:
import spacy
from tqdm.notebook import tqdm
# Load the spaCy 'en_core_web_lg' pipeline once; the same `nlp` object is
# reused below to turn every excerpt into a document vector.
nlp = spacy.load("en_core_web_lg")

In [None]:
import re
def clean_text(text):
    """Return `text` lowercased, with every newline replaced by a single space.

    Punctuation and all other characters are left untouched.
    """
    return text.lower().replace("\n", " ")

In [None]:
# Read the competition CSVs. Later cells use the 'excerpt' and 'target'
# columns of `train` and the 'id' and 'excerpt' columns of `test`.
train, test = (
    pd.read_csv('../input/commonlitreadabilityprize/train.csv'),
    pd.read_csv('../input/commonlitreadabilityprize/test.csv'),
)

In [None]:
# Normalise the raw excerpts in both splits with the same cleaning function.
# (clean_text is passed directly; no wrapper lambda is needed.)
train['excerpt'] = train['excerpt'].apply(clean_text)
test['excerpt'] = test['excerpt'].apply(clean_text)

In [None]:
# Demo of np.vstack: two 1-D arrays of length 3 are stacked as the rows
# of a single 2-D array of shape (2, 3).
a = np.array([1, 2, 3])
b = a + 1  # array([2, 3, 4])
np.vstack([a, b])

In [None]:
# nlp(text).vector returns the average of the token vectors by default
# https://spacy.io/api/doc#vector
#
# Sanity check on the first vector component: "this is" tokenises into the two
# tokens "this" and "is" (the separating space is not a token), so the document
# vector should equal the mean of those two token vectors. The two printed
# numbers are expected to match.
print(nlp("this is").vector[0])
print((nlp("this").vector[0] + nlp("is").vector[0]) / 2)

In [None]:
# Embed every training excerpt as its spaCy document vector, then stack the
# per-excerpt vectors row-wise into one 2-D feature matrix.
train_vectors = [nlp(excerpt).vector for excerpt in tqdm(train['excerpt'])]
X_train = np.vstack(train_vectors)

# Regression target, kept aligned with the rows of X_train.
y_train = train['target']
print(f'Shape of Train vectors: {X_train.shape}')

In [None]:
# Same embedding step for the test excerpts: one document vector per row.
test_vectors = [nlp(excerpt).vector for excerpt in tqdm(test['excerpt'])]
X_test = np.vstack(test_vectors)
print(f'Shape of Test vectors: {X_test.shape}')

# Visualize Data
How to read UMAP: https://pair-code.github.io/understanding-umap/

In [None]:
from umap import UMAP
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Wrap the embedding matrix in a DataFrame that shares train's index, then
# attach the target as an extra column so vectors and labels travel together.
df = pd.DataFrame(X_train, index=train.index)
df['target'] = train['target']

In [None]:
# Supervised UMAP projection of the excerpt vectors.
# `df` also carries the 'target' column, so it must be dropped from the feature
# matrix: otherwise the label leaks into the features being embedded and the
# resulting separation by target is partly an artefact. The target is passed
# only as the supervision signal `y`.
umap = UMAP(n_neighbors=20, random_state=0)
dr = umap.fit_transform(df.drop(columns='target'), y=df['target'])

In [None]:
target = df['target']
fig = plt.figure(figsize=(15, 10))
gs = fig.add_gridspec(4, 6)

# Main panel (all rows, first 4 of 6 grid columns): the 2-D embedding `dr`,
# one point per excerpt, coloured by target.
ax = fig.add_subplot(gs[:,:4])
ax.axis('off')

ax.scatter(x=dr[:,0], y=dr[:,1], s=10, c=target)
ax.set_title('Word 2 Vec Output', loc='left', fontsize=20, fontweight='bold')

# Top-right panel: target density filled with a colour gradient.
ax_dist = fig.add_subplot(gs[:2,4:])
ax_dist.set_title('Target Distribution', loc='left', fontsize=15, fontweight='bold')

# Draw the KDE fully transparent (alpha=0, linewidth=0) only to obtain the
# outline of its filled area; that outline is used below as a clip path.
sns.kdeplot(target, fill=True, alpha=0, linewidth=0, ax=ax_dist)
path = ax_dist.collections[0].get_paths()[0]
patch = mpl.patches.PathPatch(path, transform=ax_dist.transData)

# Horizontal 0..1 ramp rendered as an image spanning the current axes limits,
# then clipped to the KDE outline so the density appears gradient-filled.
x = np.linspace(0, 1, 200)

im = ax_dist.imshow(np.vstack([x, x]),
               cmap="viridis",
               aspect="auto",
               extent=[*ax_dist.get_xlim(), *ax_dist.get_ylim()]
              )

im.set_clip_path(patch)

# Quartile edges of the target: min, 25%, 50%, 75%, max.
qtile = target.quantile([0, .25, .5, .75, 1.])
n_bins = len(qtile) - 1

# Bottom-right 2x2 grid: one scatter per quartile bin, coloured on a common
# 0..1 scale (target rescaled by the overall min and max).
for idx in range(n_bins):
    sub_ax = fig.add_subplot(gs[2+idx//2,4+idx%2])
    sub_ax.axis('off')
    lower, upper = qtile.iloc[idx], qtile.iloc[idx+1]
    # Bins are half-open [lower, upper) so no point is drawn twice, except the
    # last one, which is closed on the right; with a strict '<' there the
    # sample(s) equal to the maximum target would never be plotted.
    if idx == n_bins - 1:
        q_range = (target >= lower) & (target <= upper)
    else:
        q_range = (target >= lower) & (target < upper)
    sub_ax.scatter(dr[:,0][q_range],
                   dr[:,1][q_range],
                   s=10,
                   c=(target[q_range]-qtile.iloc[0])/(qtile.iloc[-1]-qtile.iloc[0]),
                   vmin=0, vmax=1
                  )
    sub_ax.set_title(f'Q{idx}', loc='left')

fig.tight_layout()
plt.show()

Patterns can be observed in the data: difficult excerpts (dark blue) are clearly separated from easy excerpts (yellowish).

# Model

In [None]:
from pathlib import Path
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the training excerpts for validation (fixed seed for a
# reproducible split).
# NOTE: this rebinds X_train / y_train to the remaining 80%. Re-running this
# cell without first recomputing X_train would pass the already-reduced matrix
# together with the full-length train['target'] and fail on the length mismatch.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    train['target'],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Sweep the Ridge penalty strength over several orders of magnitude and report
# the train and validation RMSE for each value.
for i in [1e-5,1e-4,1e-3,1e-2,1e-1,1,10,100]:
    print(f' aplha {i}')
    # `normalize` was removed from Ridge in scikit-learn 1.2 (passing it raises
    # TypeError there). Omitting it is equivalent to the former normalize=False
    # on older versions, so this works across releases.
    regressor = Ridge(alpha=i, fit_intercept=True)
    regressor.fit(X_train, y_train)
    # RMSE is computed as sqrt(MSE) instead of `squared=False`, which is
    # deprecated/removed in recent scikit-learn releases.
    train_rmse = np.sqrt(mean_squared_error(y_train, regressor.predict(X_train)))
    val_rmse = np.sqrt(mean_squared_error(y_val, regressor.predict(X_val)))
    print(f'Train Root mean squared error: {train_rmse}')
    print(f'Validation Root mean squared error: {val_rmse}')

In [None]:
# Final model: Ridge with alpha = 1.
# `normalize` is omitted because it was removed from Ridge in scikit-learn 1.2;
# leaving it out matches the former normalize=False behaviour.
# NOTE: X_train / y_train here are the 80% portion produced by the earlier
# train_test_split cell, so the validation rows are not used for this fit.
regressor = Ridge(alpha=1, fit_intercept=True)
regressor.fit(X_train, y_train)

# Predict readability for the test excerpts and write the submission file
# with the two columns 'id' and 'target'.
test['target'] = regressor.predict(X_test)
test[['id','target']].to_csv('./submission.csv', index=False)