Version update: <br>
Added Word to Vec <br>
Detailed Notebook with visualizations: https://www.kaggle.com/getitdone/commonlit-word-to-vec-with-umap

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition files from the read-only Kaggle input directory:
# the training excerpts, the test excerpts, and the sample submission.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

Let's use Pandas Profiling library to build an intuition around train and test data

# **Basic EDA using Pandas Profiling**

In [None]:
import pandas_profiling #pre-installed

In [None]:
# Build one pandas-profiling report per frame; the cells that would display
# them are currently commented out.
train_prof = pandas_profiling.ProfileReport(df=train)
test_prof = pandas_profiling.ProfileReport(df=test)

In [None]:
# train_prof

Notes for train data:
* id, target, standard_error are all unique

In [None]:
# test_prof

Notes for test data:
* only 7 rows are there
* standard_error is not provided


**Understanding "target"**

References:
* https://www.kaggle.com/gunesevitan/commonlit-readability-prize-eda
* https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model
* https://www.kaggle.com/andreshg/commonlit-a-complete-analysis

* Target column is named as target and it is reading ease of the excerpt. The excerpt with *436ce79fe* id is set as baseline for comparisons. That's the reason why its target and standard_error values are 0. 
* Other excerpts are compared with *436ce79fe* and rated by multiple raters based on their ease of read. 
* After that, the excerpts are ranked with Bradley-Terry model. 
* Therefore, every excerpt with a target value greater than 0 is easier to read, and every excerpt with a target value less than 0 is harder to read, than that baseline excerpt. 
* As there were multiple raters, standard_error tells us the measure of spread of scores among the raters for each excerpt.

**Negative Value**_______Zero___________**Positive Value** <br>
*Difficult Excerpt*____Base Line_____*Easy Excerpt*

In [None]:
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from scipy import stats

In [None]:
def plot_graphs(df,feature):
    """Show a Q-Q plot and a density plot of one numeric column side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to plot.
    feature : str
        Name of the column to plot; also used in the subplot titles,
        the annotations and the density trace label.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    values = df[feature]
    # probplot yields the theoretical quantiles (osm), the ordered sample
    # values (osr) and the least-squares line fitted through them.
    (osm, osr), (slope, intercept, _r) = stats.probplot(values, plot=None)
    mean_value = values.mean()
    median_value = values.median()

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(
            "Quantile-Quantile Plot of " + feature,
            "Distribution Plot of " + feature
        )
    )

    # Left panel: fitted reference line first, sample quantiles on top.
    fig.add_trace(go.Scatter(
        x=osm,
        y=slope*osm + intercept,
        mode='lines',
        line={
            'color': '#c81515',
            'width': 2.5
        }
    ), row=1, col=1)
    fig.add_trace(go.Scatter(
        x=osm,
        y=osr,
        mode='markers',
        marker={
            'color': '#496595'
        }
    ), row=1, col=1)

    # Label the distribution with the plotted feature (it used to be
    # hard-coded to 'target', which mislabelled every other column).
    fig1 = ff.create_distplot([values], [feature],
                              bin_size=.05, show_rug=False)

    # Right panel: re-use the second trace of the distplot, filled to zero.
    # NOTE(review): index 1 is presumably the KDE curve (index 0 being the
    # histogram) -- confirm against plotly's create_distplot output.
    fig.add_trace(go.Scatter(
        fig1['data'][1],
        line=dict(
            width=1.5,
        ),
        fill='tozeroy'
    ), row=1, col=2)

    # Annotate mean and median; the arrow tail is offset by 20% of the value.
    fig.add_annotation(
        yref="y domain",
        x=mean_value,
        y=0.5,
        axref="x",
        ayref="y domain",
        ax=mean_value + 0.2*mean_value,
        ay=0.1,
        text=f"<span>{feature.capitalize()} mean</span>= {round(mean_value,3)}",
        row=1, col=2)
    fig.add_annotation(
        yref="y domain",
        x=median_value,
        y=0.3,
        axref="x",
        ayref="y domain",
        ax=median_value + 0.2*median_value,
        ay=0.2,
        text=f"<span>{feature.capitalize()} median</span>= {round(median_value,3)}",
        row=1, col=2)

    # Dashed vertical markers: default colour for the mean, red for the median.
    fig.add_vline(
        x=mean_value,
        line_width=2,
        line_dash="dash", row=1, col=2
    )
    fig.add_vline(
        x=median_value,
        line_width=2, line_dash="dash", line_color='red', row=1, col=2
    )

    fig.update_layout(showlegend=False)
    fig.show()

In [None]:
# Q-Q and distribution plots for the target column of the training data.
plot_graphs(train,'target')

In [None]:
# Same plots for standard_error, leaving out rows where it is exactly 0
# (the baseline excerpt described in the notes above).
plot_graphs(train[train['standard_error']!=0],'standard_error')

In [None]:
# Hexbin joint plot of target against standard_error, again excluding rows
# whose standard_error is exactly 0.
sns.jointplot(data= train[train['standard_error']!=0],
    x='target', 
    y='standard_error', 
    kind='hex',
    height=8,

)
# Add a figure-level title and leave room for it above the joint plot.
plt.suptitle("Target vs Standard error ",font="Serif", size=20)
plt.subplots_adjust(top=0.95)
plt.show()

* When standard_error is plotted against target without the baseline excerpt, a relationship can be seen. Excerpts with medium ease of read tend to have less spread of scores, however excerpts at both ends have more spread because they are either too easy or too hard for the raters. 
* Raters' subjective opinions vary a lot when they rate those easy and hard excerpts, but they give closer opinions when the excerpts have medium difficulty.

# Basic Features
* Excerpt Length
* Excerpt Word Count
* Max Length of sentence in excerpt
* Average Length of Sentences

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from statistics import mean

In [None]:
#https://stackoverflow.com/a/55608579
def min_max_mean_sentence_length(text):
    """Return the longest, shortest and mean sentence length of ``text``.

    Sentences come from ``sent_tokenize``; a sentence's length is the number
    of pieces obtained by splitting it on single spaces.

    Parameters
    ----------
    text : str
        Passage to analyse.

    Returns
    -------
    tuple
        ``(max_length, min_length, mean_length)`` -- note the order is
        max first despite the function name. The mean is rounded to three
        decimals. ``(0, 0, 0)`` is returned when no sentence is found.
    """
    # Keep one length per sentence in a list. The previous dict keyed by the
    # sentence text silently merged repeated sentences, which skewed the mean.
    lengths = [len(sentence.split(" ")) for sentence in sent_tokenize(text)]

    if not lengths:
        # max()/min()/mean() all raise on an empty sequence.
        return 0, 0, 0

    return max(lengths), min(lengths), round(mean(lengths), 3)

In [None]:
def basic_features(_):
    """Return a copy of the frame with length-based features of ``excerpt``.

    Added columns: ``excerpt_len`` (characters), ``excerpt_word_count``
    (pieces after splitting on single spaces) and ``max_len_sent``,
    ``min_len_sent``, ``avg_len_sent`` from ``min_max_mean_sentence_length``.
    The input frame is not modified.
    """
    featured = _.copy()
    excerpts = featured['excerpt']

    featured['excerpt_len'] = excerpts.apply(len)
    featured['excerpt_word_count'] = excerpts.apply(lambda text: len(text.split(' ')))

    # Row-wise apply with expand turns each returned tuple into three columns.
    sentence_stats = featured.apply(
        lambda row: min_max_mean_sentence_length(row['excerpt']),
        axis=1,
        result_type='expand',
    )
    featured[['max_len_sent','min_len_sent','avg_len_sent']] = sentence_stats
    return featured

In [None]:
# Replace train with a copy that carries the length-based feature columns.
train = basic_features(train)

In [None]:
# Preview the first rows to check the new feature columns.
train.head()

Plotting Basic Features

In [None]:
def plot_feature(feature, df=None):
    """Plot one feature against the target and show the feature's density.

    Parameters
    ----------
    feature : str
        Name of the column to plot.
    df : pandas.DataFrame, optional
        Frame holding ``feature`` and ``target``. Defaults to the
        notebook-level ``train`` frame, which is what the function always
        used before this parameter existed.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    if df is None:
        df = train

    fig, axes = plt.subplots(ncols=2, figsize=(32, 6))

    sns.regplot(x=df['target'], y=df[feature], line_kws={'color': 'red'}, ax=axes[0])
    # Pass a label so the legend requested below has an entry; without it
    # matplotlib warns that there are no labelled artists and draws nothing.
    sns.kdeplot(df[feature], fill=True, label=feature, ax=axes[1])

    axes[0].set_xlabel('target', size=18)
    axes[0].set_ylabel(feature, size=18)
    axes[1].set_xlabel('')
    axes[1].set_ylabel('')
    axes[1].legend(prop={'size': 15})
    for ax in axes:
        ax.tick_params(axis='x', labelsize=15)
        ax.tick_params(axis='y', labelsize=15)
    axes[0].set_title(f'target vs {feature}', size=20, pad=20)
    axes[1].set_title(f'{feature} Distribution', size=20, pad=20)

    plt.show()

In [None]:
# Draw the target-vs-feature and distribution plots for each basic feature.
for feature in ['excerpt_len', 'excerpt_word_count', 'min_len_sent', 'max_len_sent', 'avg_len_sent']:
    plot_feature(feature)

* Strong negative relationships can be seen in the scatter plots of max_len_sent and avg_len_sent.
* Other features don't look very promising from their skewed distributions and weak relationships, but they can be still useful in terms of predictive power.

# Model on Basic Features 

In [None]:
# Split into train and test sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows (fixed seed). X_* keep every column except
# target; y_* are plain NumPy arrays of the target values.
X_train, X_test, y_train, y_test = train_test_split(train.drop(columns='target'), train['target'].values, random_state=42,test_size=0.20)
# Sanity check: features and labels must have matching lengths in each split.
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

In [None]:
# Columns produced by basic_features that the first model is trained on.
features = ['excerpt_len', 'excerpt_word_count', 'min_len_sent', 'max_len_sent', 'avg_len_sent']

In [None]:
from sklearn.metrics import mean_squared_error

**Baseline using mean value of target**

In [None]:
# Constant baseline: predict the mean target of the training split for every
# held-out row. The mean is taken from y_train only, so the held-out targets
# being scored do not leak into the baseline prediction.
pred_y = [y_train.mean()] * len(y_test)
print(f' Test RMSE when we fill predictions with mean value of target in train data is {round(np.sqrt(mean_squared_error(y_test,pred_y)),4)}')

**LightGBM model**

In [None]:
import lightgbm as lgb
# LightGBM regressor with default hyperparameters and a fixed seed.
gbm = lgb.LGBMRegressor(random_state=42)
# NOTE(review): no eval_set is passed, so eval_metric presumably has no effect
# on this fit -- confirm against the LightGBM docs.
gbm.fit(X_train[features],y_train,eval_metric='mse')
# Predictions for the held-out split, scored in the next cell.
pred_y = gbm.predict(X_test[features])

In [None]:
# RMSE of the basic-feature model on the held-out split, rounded to 4 decimals.
print(f' Test RMSE using basic features {round(np.sqrt(mean_squared_error(y_test,pred_y)),4)}')

* ~14% improvement from baseline model

In [None]:
# Compute the same basic features on the competition test set, then predict.
# Note that pred_y is reused here and now holds test-set predictions.
test = basic_features(test)
pred_y = gbm.predict(test[features])

In [None]:
def create_submission(_,predictions):
    """Build the two-column submission frame.

    Parameters
    ----------
    _ : pandas.DataFrame
        Frame with at least an ``id`` column; it is left untouched.
    predictions : array-like
        One predicted target per row of the frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``target`` only.
    """
    # assign() works on a fresh copy, so the caller's frame is not mutated.
    with_target = _.assign(target=predictions)
    return with_target[['id','target']]

In [None]:
# submission = create_submission(test,pred_y)
# submission.to_csv('./submission.csv',index=False)

# Readability Scores

1. [The Flesch Reading Ease formula](https://www.kaggle.com/prvnkmr/domain-knowledge-readability-score-methods?scriptVersionId=62948009&cellId=10) <br>
RE = 206.835 – (1.015 x ASL) – (84.6 x ASW)<br>
*RE = Readability Ease<br>
ASL = Average Sentence Length (i.e., the number of words divided by the number of sentences) <br>
ASW = Average number of syllables per word (i.e., the number of syllables divided by the number of words) <br>
The output, i.e., RE is a number ranging from 0 to 100. The higher the number, the easier the text is to read.*



2. [The Flesch-Kincaid Grade Level Readability Formula](https://www.kaggle.com/prvnkmr/domain-knowledge-readability-score-methods?scriptVersionId=62948009&cellId=11) <br>
FKRA = (0.39 x ASL) + (11.8 x ASW) - 15.59<br>
*FKRA = Flesch-Kincaid Reading Age<br>
ASL = Average Sentence Length (i.e., the number of words divided by the number of sentences) <br>
ASW = Average number of syllables per word (i.e., the number of syllables divided by the number of words) <br>*




3. [The fog scale](https://www.kaggle.com/prvnkmr/domain-knowledge-readability-score-methods?scriptVersionId=62948009&cellId=12) <br>
Grade Level = 0.4 (ASL + PHW)<br>
*ASL = Average Sentence Length (i.e., number of words divided by the number of sentences)<br>
PHW = Percentage of Hard Words*

In [None]:
#https://www.kaggle.com/duboisian/first-draft-model?scriptVersionId=63553418&cellId=2
def GrunningFog(excerpt):
    """
    Estimate the grade level of a passage with the Gunning fog formula:
    0.4 * (average tokens per sentence + percentage of complex tokens).

    Sentences are the pieces between '.' characters that are longer than one
    character. A token is complex when its lemma has three or more syllables
    according to syllable_count.
    """
    fragments = excerpt.replace('\n',' ').split('.')
    sentences = [fragment for fragment in fragments if len(fragment)>1]
    lemmatizer = nltk.stem.WordNetLemmatizer()

    tokens_per_sentence = []
    complex_per_sentence = []
    for sentence in sentences:
        raw_tokens = nltk.word_tokenize(sentence)
        tokens_per_sentence.append(len(raw_tokens))
        lemmas = [lemmatizer.lemmatize(token) for token in raw_tokens]
        complex_flags = [int(syllable_count(lemma) >= 3) for lemma in lemmas]
        complex_per_sentence.append(np.sum(complex_flags))

    # The NumPy reductions are kept on purpose: with no sentences they give
    # nan (plus a warning) rather than raising.
    average_sentence_length = np.mean(tokens_per_sentence)
    complex_share = np.sum(complex_per_sentence)/np.sum(tokens_per_sentence)
    return 0.4*(average_sentence_length + (100*complex_share))

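4. [The SMOG Index](https://www.geeksforgeeks.org/readability-index-pythonnlp/3) <br>
SMOG = 1.0430 x sqrt(PSW x (30 / NS)) + 3.1291<br>
*PSW = Number of polysyllabic words (i.e., words with 3 or more syllables)<br>
NS = Number of sentences*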

In [None]:
def SMOG(excerpt):
    """Compute the SMOG index of a passage.

    SMOG = 1.0430 * sqrt(polysyllables * 30 / sentences) + 3.1291, where
    sentences are the pieces between '.' characters that are longer than one
    character, and a polysyllable is any token with three or more syllables
    according to syllable_count.

    Parameters
    ----------
    excerpt : str
        Passage to score.

    Returns
    -------
    float
        The SMOG score, or nan when the passage has no usable sentence.
    """
    fragments = excerpt.replace('\n',' ').split('.')
    sentences = [fragment for fragment in fragments if len(fragment)>1]

    if not sentences:
        # The formula divides by the sentence count; without this guard an
        # empty or punctuation-only passage raised ZeroDivisionError.
        return np.nan

    polysyllables = 0
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        polysyllables += sum(1 for token in tokens if syllable_count(token) >= 3)

    return (1.0430 * np.sqrt(polysyllables * (30/len(sentences)))) + 3.1291

In [None]:
#https://stackoverflow.com/a/46759549
def syllable_count(word):
    """Estimate the number of syllables in a word with a vowel-group heuristic.

    Each run of consecutive vowels (a, e, i, o, u, y) counts as one syllable,
    a trailing "e" is treated as silent, and any non-empty word counts as at
    least one syllable.

    Parameters
    ----------
    word : str
        Word to analyse; case is ignored.

    Returns
    -------
    int
        Estimated syllable count, or 0 for an empty string.
    """
    if not word:
        # Indexing word[0] below would raise IndexError on an empty string.
        return 0

    word = word.lower()
    vowels = "aeiouy"

    count = 1 if word[0] in vowels else 0
    # A vowel preceded by a non-vowel starts a new vowel group.
    for previous, current in zip(word, word[1:]):
        if current in vowels and previous not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

In [None]:
def asw_asl(_):
    """Return a copy of the frame with readability features of ``excerpt``.

    Added columns:
      * ``ASL``     - average sentence length (words per sentence)
      * ``ASW``     - average syllables per word
      * ``RE``      - Flesch Reading Ease, 206.835 - 1.015*ASL - 84.6*ASW
      * ``FKRA``    - Flesch-Kincaid grade, 0.39*ASL + 11.8*ASW - 15.59
      * ``GrunFog`` - result of GrunningFog on the excerpt
      * ``SMOG``    - result of SMOG on the excerpt

    Sentences are the non-empty pieces between '.' characters and words are
    whitespace-separated tokens. The input frame is not modified.
    """
    df = _.copy()

    def _avg_sentence_length(text):
        # Newlines become spaces so words on either side of a line break are
        # not fused together; empty fragments (e.g. after the final '.') are
        # not counted as sentences.
        word_counts = [len(fragment.split()) for fragment in text.replace('\n', ' ').split('.')]
        word_counts = [n for n in word_counts if n > 0]
        return np.mean(word_counts) if word_counts else 0.0

    def _avg_syllables_per_word(text):
        # split() without an argument drops the empty tokens that repeated
        # spaces used to add to the word count.
        words = text.replace('\n', ' ').replace('.', '').split()
        if not words:
            return 0.0
        return np.sum([syllable_count(word) for word in words]) / len(words)

    df['ASL'] = df['excerpt'].apply(_avg_sentence_length)
    df['ASW'] = df['excerpt'].apply(_avg_syllables_per_word)
    # Column arithmetic keeps the same operation order as the formulas above.
    df['RE'] = 206.835 - (1.015 * df['ASL']) - (84.6 * df['ASW'])
    df['FKRA'] = (0.39 * df['ASL']) + (11.8 * df['ASW']) - 15.59
    df['GrunFog'] = df['excerpt'].apply(GrunningFog)
    df['SMOG'] = df['excerpt'].apply(SMOG)
    return df

In [None]:
# Replace train with a copy that also carries the readability-score columns.
train = asw_asl(train)

In [None]:
# Draw the target-vs-feature and distribution plots for each readability score.
for feature in ['RE','FKRA','GrunFog','SMOG']:
    plot_feature(feature)

# Model on Basic Features + Readability Scores

In [None]:
# Split into train and test sets
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as for the basic-feature model, redone so the
# split frames include the readability columns added to train since then.
X_train, X_test, y_train, y_test = train_test_split(train.drop(columns='target'), train['target'].values, random_state=42,test_size=0.20)
# Sanity check: features and labels must have matching lengths in each split.
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

In [None]:
# Basic length features followed by the columns added by asw_asl.
features = ['excerpt_len', 'excerpt_word_count', 'min_len_sent', 'max_len_sent', 'avg_len_sent','ASL',
 'ASW',
 'RE',
 'FKRA',
 'GrunFog',
 'SMOG']

In [None]:
import lightgbm as lgb
# Refit the same default LightGBM regressor, now on the extended feature list.
gbm = lgb.LGBMRegressor(random_state=42)
# NOTE(review): no eval_set is passed, so eval_metric presumably has no effect
# on this fit -- confirm against the LightGBM docs.
gbm.fit(X_train[features],y_train,eval_metric='mse')
# Predictions for the held-out split, scored in the next cell.
pred_y = gbm.predict(X_test[features])

In [None]:
# RMSE of the basic + readability model on the held-out split. The message
# names both feature groups so it is not confused with the earlier
# basic-features-only result.
print(f' Test RMSE using basic features + readability scores {round(np.sqrt(mean_squared_error(y_test,pred_y)),4)}')

* ~19.6% improvement from baseline model

In [None]:
# Add the readability-score columns to the competition test set.
test = asw_asl(test)

In [None]:
# basic_features was already applied to test in an earlier cell; calling it
# again recomputes the same columns on a fresh copy.
test = basic_features(test)
# Test-set predictions from the basic + readability model.
pred_y = gbm.predict(test[features])

In [None]:
# submission = create_submission(test,pred_y)
# submission.to_csv('./submission.csv',index=False)

# Word 2 Vec

In [None]:
import spacy
from tqdm.notebook import tqdm
# Load the spaCy pipeline that supplies the document vectors used below.
nlp = spacy.load('en_core_web_lg')

In [None]:
import re
def clean_text(text):
    """Lower-case the text and turn every newline into a single space."""
    return text.lower().replace("\n"," ")

In [None]:
# Overwrite the excerpt column with its cleaned form in both frames. The
# length and readability features above were computed on the uncleaned text.
train['excerpt'] = train['excerpt'].apply(lambda x: clean_text(x))
test['excerpt'] = test['excerpt'].apply(lambda x: clean_text(x))

In [None]:
# One spaCy document vector per training excerpt, stacked row-wise.
# This overwrites the X_train / y_train names from the earlier split cells.
X_train = np.vstack([nlp(text).vector for text in tqdm(train['excerpt'])])
y_train = train['target']
print(f'Shape of Train vectors: {X_train.shape}')

In [None]:
# One spaCy document vector per test excerpt, stacked row-wise.
# X_test now refers to the competition test set, not the held-out split.
X_test = np.vstack([nlp(text).vector for text in tqdm(test['excerpt'])])
print(f'Shape of Test vectors: {X_test.shape}')

# Model on Basic Features + Readability Scores + Word 2 Vec

In [None]:
from pathlib import Path
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Put the document vectors next to the hand-crafted features. The concat
# aligns on the index, so train is assumed to still have its default
# 0..n-1 index -- TODO confirm.
# NOTE(review): the vector columns get integer labels while the feature columns
# are strings; some scikit-learn versions may reject mixed label types -- verify.
X_train_df = pd.concat([pd.DataFrame(X_train),train[features]],axis=1)

In [None]:
# Same layout for the competition test set: vectors first, then the features.
X_test_df = pd.concat([pd.DataFrame(X_test),test[features]],axis=1)

In [None]:
# 80/20 train/validation split of the combined matrix (fixed seed).
# This rebinds X_train and y_train to the training part of the split.
X_train, X_val, y_train, y_val = train_test_split(X_train_df, train['target'], test_size=0.2, random_state=42)

In [None]:
# Sweep the Ridge regularisation strength over powers of ten and print train
# and validation RMSE for each value.
# NOTE(review): Ridge's `normalize` argument and mean_squared_error's `squared`
# argument are deprecated/removed in newer scikit-learn releases -- confirm
# the installed version still accepts them.
for i in [1e-5,1e-4,1e-3,1e-2,1e-1,1,10,100]:
    print(f' aplha {i}')
    regressor = Ridge(alpha=i,fit_intercept=True, normalize=True)
    regressor.fit(X_train,y_train)
    print(f'Train Root mean squared error: {mean_squared_error(y_train,regressor.predict(X_train),squared=False)}')
    print(f'Validation Root mean squared error: {mean_squared_error(y_val,regressor.predict(X_val),squared=False)}')

In [None]:
# Final model: refit Ridge with the alpha chosen from the sweep above and
# write the submission file.
# normalize=True matches the setting used during the sweep. Alpha is applied
# to the normalised features there, so the selected value does not carry over
# to a model fitted with normalize=False.
regressor = Ridge(alpha=0.1,fit_intercept=True, normalize=True) #alpha = 0.1
# The model is fitted on the training part of the split only.
regressor.fit(X_train, y_train)
# Store the predictions on the test frame and export the two required columns.
test['target'] = regressor.predict(X_test_df)
test[['id','target']].to_csv('./submission.csv', index=False)