**Some of the key resources that helped me out in formulating this:**
- https://www.kaggle.com/ejmejm/commonlit-eda-video-tutorial?scriptVersionId=64939768 : This notebook also comes with an excellent YouTube video walkthrough.
- https://www.kaggle.com/ruchi798/commonlit-readability-prize-eda-baseline

In [None]:
!pip install textstat
import numpy as np
import pandas as pd
import seaborn as sns
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse
import os
import textstat

import nltk
from nltk.corpus import stopwords
from nltk import pos_tag

In [None]:
# Competition files live under the Kaggle input directory.
data_dir = '/kaggle/input/commonlitreadabilityprize'
train_data_path, test_data_path = (
    os.path.join(data_dir, csv_name) for csv_name in ('train.csv', 'test.csv')
)

train_df, test_df = pd.read_csv(train_data_path), pd.read_csv(test_data_path)

# Print the (rows, columns) shape of each split, one per line.
print(train_df.shape, test_df.shape, sep='\n')

In [None]:
# Peek at the first rows of the training split to see the available columns.
train_df.head()

### EDA

In [None]:
# Summarise the training frame's columns (dtypes and non-null counts).
train_df.info()

- **Input Variable:** *excerpt*
- **Target Variable:** *target*

**Unique Characters**

In [None]:
# Collect every distinct character that occurs in any excerpt. A single set
# union over the excerpt strings does this without per-row side effects.
all_chars = set().union(*train_df['excerpt'])
sorted_chars = sorted(all_chars)

# First pass: the raw characters, each followed by a space.
print(''.join(c + ' ' for c in sorted_chars), end='')

print('\n\n')

# Second pass: each character paired with its code point, which makes
# whitespace and look-alike characters distinguishable.
print(''.join(f'({c}, {ord(c)}) ' for c in sorted_chars), end='')

In [None]:
# Flag excerpts that contain "hard characters": any character whose code
# point lies in the inclusive range 176..339.
hard_chars = {ch for ch in all_chars if 176 <= ord(ch) <= 339}

print(hard_chars)

# An excerpt is flagged when it shares at least one character with hard_chars.
train_df['has_hard_char'] = train_df['excerpt'].apply(
    lambda text: not hard_chars.isdisjoint(text)
)
# Number of flagged excerpts (True counts as 1).
sum(train_df['has_hard_char'])

**Excerpt length distribution**

In [None]:
# Character count of every excerpt.
train_df['len_excerpt'] = train_df['excerpt'].map(len)

In [None]:
# Histogram of excerpt lengths. histplot replaces the deprecated distplot;
# it draws no KDE curve by default, matching the previous kde=False.
sns.histplot(x=train_df['len_excerpt'])

**Target Column Distribution**

In [None]:
# Histogram of the target. histplot replaces the deprecated distplot;
# it draws no KDE curve by default, matching the previous kde=False.
sns.histplot(x=train_df['target'])

**Target Correlation with "Hard Characters"**

In [None]:
# Target distribution split by the hard-character flag. x/y are passed by
# keyword because newer seaborn releases reject them as positional arguments.
sns.violinplot(x=train_df['has_hard_char'], y=train_df['target'], palette=['b', 'r'])

**Target Correlation with Excerpt Length**

In [None]:
# Pearson correlation between excerpt length and target. The off-diagonal
# entry is r; squaring it gives the r^2 reported below.
correlation_matrix = np.corrcoef(train_df['len_excerpt'], train_df['target'])
correlation_xy = correlation_matrix[0, 1]
r_squared = correlation_xy ** 2

print(f'Linear fit r^2: {r_squared}')

In [None]:
# NOTE(review): only a cell's last expression is displayed, so the scalar on the
# next line is evaluated and discarded; only the full 2x2 matrix is shown.
correlation_matrix[0,1]
correlation_matrix

**Target Correlation with Standard Error**

In [None]:
# Target against its standard error. x/y are passed by keyword because newer
# seaborn releases reject them as positional arguments.
sns.scatterplot(x=train_df['target'], y=train_df['standard_error'], alpha=0.4)

**Finding the outlier**

In [None]:
# Drop the rows whose target is exactly 0. The explicit .copy() makes the
# filtered frame independent of the original, so columns assigned to it later
# do not trigger pandas' SettingWithCopyWarning.
train_df = train_df[train_df['target'] != 0].copy()

In [None]:
# Shape of the training frame after the zero-target rows were filtered out.
train_df.shape

**Extracting Top n-grams (1, 2, 3)**

In [None]:
# preprocess text
def preprocess(data):
    """Clean every excerpt in ``data`` and return the cleaned strings.

    Each excerpt has its non-letter characters replaced by spaces, is
    lower-cased, tokenized with NLTK, stripped of English stopwords and
    lemmatized with WordNet. The surviving tokens are joined with single
    spaces.

    Args:
        data: Object indexable by ``'excerpt'`` (e.g. a DataFrame) that
            yields the excerpt strings.

    Returns:
        list of str: one cleaned string per excerpt, in input order.
    """
    # Neither the stopword set nor the lemmatizer depends on the excerpt, so
    # build each of them once instead of once per token / per excerpt.
    stop_words = set(stopwords.words("english"))
    lemma = nltk.WordNetLemmatizer()

    excerpt_processed = []
    for e in data['excerpt']:
        # keep letters only, then normalise case
        e = re.sub("[^a-zA-Z]", " ", e).lower()

        # tokenize, drop stopwords, lemmatize what is left
        tokens = nltk.word_tokenize(e)
        tokens = [lemma.lemmatize(word) for word in tokens if word not in stop_words]

        excerpt_processed.append(" ".join(tokens))

    return excerpt_processed

# Store the cleaned text next to the raw excerpt on both splits.
train_df["excerpt_preprocessed"] = preprocess(train_df)
test_df["excerpt_preprocessed"] = preprocess(test_df)

In [None]:
def _top_n_ngrams(corpus, n, ngram_range):
    """Return the ``n`` most frequent n-grams of ``corpus`` as (ngram, count) pairs.

    Args:
        corpus: Iterable of documents (strings) to count n-grams in.
        n: How many pairs to return; ``None`` returns all of them.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to the count vectorizer.

    Returns:
        list of (ngram, count) tuples sorted by descending count.
    """
    vec = CV(ngram_range=ngram_range).fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Column sums give the corpus-wide count of every vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words of ``corpus`` as (word, count) pairs."""
    return _top_n_ngrams(corpus, n, (1, 1))


def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent two-word sequences of ``corpus`` as (bigram, count) pairs."""
    return _top_n_ngrams(corpus, n, (2, 2))


def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent three-word sequences of ``corpus`` as (trigram, count) pairs."""
    return _top_n_ngrams(corpus, n, (3, 3))

def plot_bt(x,w,p):
    """Bar-plot the 20 most frequent n-grams of the preprocessed training excerpts.

    Args:
        x: Callable taking ``(corpus, n)`` and returning ``(ngram, count)``
            pairs, e.g. ``get_top_n_bigram``.
        w: Label appended to the title ("Top 20 " + w).
        p: Seaborn palette specification used for the bar edge colours.

    Returns:
        DataFrame with columns ``word`` and ``freq`` holding the plotted pairs.
    """
    top_ngrams_df = pd.DataFrame(
        x(train_df['excerpt_preprocessed'], 20),
        columns=['word', 'freq'],
    )
    edge_colors = sns.color_palette(p, 20)

    plt.figure(figsize=(16, 8))
    # Transparent faces with coloured outlines: only the bar edges are drawn.
    sns.barplot(
        x='freq', y='word', data=top_ngrams_df,
        facecolor=(0, 0, 0, 0), linewidth=3, edgecolor=edge_colors,
    )
    plt.title("Top 20 " + w, font='Serif')
    plt.xlabel("Frequency", fontsize=14)
    plt.ylabel("")
    plt.xticks(rotation=45, fontsize=13)
    plt.yticks(fontsize=13)
    return top_ngrams_df

In [None]:
# One plot_bt call per n-gram order: each call builds the top-20 table from the
# preprocessed training excerpts, draws the bar chart and returns the table.
common_words_df1 = plot_bt(get_top_n_words, "unigrams", "ch:start=3, rot=.1")
common_words_df2 = plot_bt(get_top_n_bigram, "bigrams", "ch:rot=-.5")
common_words_df3 = plot_bt(get_top_n_trigram, "trigrams", "ch:start=-1, rot=-.6")


### Feature Engineering

In [None]:
# Renumber the rows 0..n-1 and derive the feature frame from a copy, so the
# new columns land on text_props rather than on train_df.
train_df.reset_index(drop=True, inplace=True)
text_props = train_df.copy()

def avg_word_len(df):
    """Return, for each string in the Series ``df``, the mean length of its whitespace-separated tokens."""
    return df.str.split().map(lambda tokens: np.mean([len(tok) for tok in tokens]))

raw_excerpts = train_df['excerpt']
clean_excerpts = train_df['excerpt_preprocessed']

# Character counts and mean token lengths, before and after preprocessing.
text_len = raw_excerpts.str.len()
text_len_pre = clean_excerpts.str.len()
avg_text = avg_word_len(raw_excerpts)
avg_text_pre = avg_word_len(clean_excerpts)

# textstat word and sentence counts, one value per excerpt.
lexicon_count = [textstat.lexicon_count(text) for text in raw_excerpts]
lexicon_count_pre = [textstat.lexicon_count(text) for text in clean_excerpts]
sentence_count = [textstat.sentence_count(text) for text in raw_excerpts]

# Attach the features in a fixed column order.
train_features = {
    'text_len': text_len,
    'text_len_pre': text_len_pre,
    'lexicon_count': lexicon_count,
    'lexicon_count_pre': lexicon_count_pre,
    'avg_text': avg_text,
    'avg_text_pre': avg_text_pre,
    'sentence_count': sentence_count,
}
for feature_name, feature_values in train_features.items():
    text_props[feature_name] = feature_values

In [None]:
# Same text features as for the training split, written straight onto test_df.
raw_excerpts = test_df['excerpt']
clean_excerpts = test_df['excerpt_preprocessed']

# Character counts and mean token lengths, before and after preprocessing.
text_len = raw_excerpts.str.len()
text_len_pre = clean_excerpts.str.len()
avg_text = avg_word_len(raw_excerpts)
avg_text_pre = avg_word_len(clean_excerpts)

# textstat word and sentence counts, one value per excerpt.
lexicon_count = [textstat.lexicon_count(text) for text in raw_excerpts]
lexicon_count_pre = [textstat.lexicon_count(text) for text in clean_excerpts]
sentence_count = [textstat.sentence_count(text) for text in raw_excerpts]

# Attach the features in a fixed column order.
test_features = {
    'text_len': text_len,
    'text_len_pre': text_len_pre,
    'lexicon_count': lexicon_count,
    'lexicon_count_pre': lexicon_count_pre,
    'avg_text': avg_text,
    'avg_text_pre': avg_text_pre,
    'sentence_count': sentence_count,
}
for feature_name, feature_values in test_features.items():
    test_df[feature_name] = feature_values

**Correlation between features**

In [None]:
num_cols = [
    'text_len', 'text_len_pre', 'lexicon_count', 'lexicon_count_pre',
    'avg_text', 'avg_text_pre', 'sentence_count', 'target',
]
# Absolute correlations: the heatmap shows strength, not direction.
corr = text_props[num_cols].corr().abs()

# The matrix is symmetric, so mask the upper triangle including the diagonal.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig = plt.figure(figsize=(12, 12), dpi=80)
sns.heatmap(
    corr, mask=mask, cmap='BuPu', robust=True, center=0,
    square=True, linewidths=.5, annot=True,
)
plt.title('Correlation of text properties', fontsize=15, font="Serif")
plt.show()

**avg_text_pre is highly correlated with target**

In [None]:
# Running list of the columns selected as model features.
filtered_cols = ["avg_text_pre"]

**Part-of-Speech tagging**

In [None]:
# Split each preprocessed excerpt on whitespace and run nltk's pos_tag on the token list.
text_props['pos_tags'] = text_props['excerpt_preprocessed'].str.split().map(pos_tag)

def count_tags(pos_tags):
    """Count how often each POS tag occurs.

    Args:
        pos_tags: Iterable of ``(word, tag)`` pairs.

    Returns:
        dict mapping each tag to its number of occurrences, keyed in
        first-seen order.
    """
    tag_count = {}
    for _word, tag in pos_tags:
        tag_count[tag] = tag_count.get(tag, 0) + 1
    return tag_count

text_props['tag_counts'] = text_props['pos_tags'].map(count_tags)

# Every tag seen in the training excerpts. The names are sorted because the
# iteration order of a set of strings can differ between kernel sessions,
# which would otherwise reshuffle the tag columns from run to run.
set_pos = {tag for tags in text_props['tag_counts'] for tag in tags}
tag_cols = sorted(set_pos)

# One count column per tag; a tag absent from an excerpt counts as 0.
for tag in tag_cols:
    text_props[tag] = text_props['tag_counts'].map(lambda x: x.get(tag, 0))

In [None]:
# Split each preprocessed test excerpt on whitespace and POS-tag the tokens.
test_df['pos_tags'] = test_df['excerpt_preprocessed'].str.split().map(pos_tag)

# count_tags is the helper defined alongside the training POS features; it is
# reused here rather than defined a second time.
test_df['tag_counts'] = test_df['pos_tags'].map(count_tags)

# Iterate over the training tag list so the test frame gets the same tag
# columns; a tag absent from an excerpt counts as 0.
for tag in tag_cols:
    test_df[tag] = test_df['tag_counts'].map(lambda x: x.get(tag, 0))

In [None]:
# Display the per-excerpt count columns, one column per POS tag.
text_props[tag_cols]

In [None]:
# Absolute correlations among the POS-tag counts and the target.
corr_tags = text_props[[*tag_cols, 'target']].corr().abs()

# The matrix is symmetric, so mask the upper triangle including the diagonal.
mask_tags = np.triu(np.ones_like(corr_tags, dtype=bool))

fig = plt.figure(figsize=(30, 24), dpi=80)
sns.heatmap(
    corr_tags, mask=mask_tags, cmap='BuPu', robust=True, center=0,
    square=True, linewidths=.5, annot=True,
)
plt.title('Correlation of POS tags', fontsize=15, font="Serif")
plt.show()

# sentence_count is highly correlated to target 

**VBD, NN, VB, JJ are the most correlated ones with target** <br>
**Tags REF: https://cs.nyu.edu/~grishman/jet/guide/PennPOS.html**

In [None]:
# Add the selected POS-tag counts to the feature list, then display it.
filtered_cols.extend(["VBD", "NN", "VB", "JJ"])
filtered_cols

In [None]:
# Total occurrences of each POS tag over all training excerpts, most frequent first.
pos = text_props[tag_cols].sum().sort_values(ascending=False)

plt.figure(figsize=(16, 10))
ax = sns.barplot(x=pos.index, y=pos.values, palette="Wistia")
# Log scale on the count axis.
ax.set_yscale('log')
plt.xticks(rotation=50)
plt.title('POS tags frequency', fontsize=15, font="Serif")
plt.show()

**Common Readability Tests**

In [None]:
# Readability formulas applied to every training excerpt: column name -> scorer.
readability_tests = {
    'flesch_re': textstat.flesch_reading_ease,
    'flesch_kg': textstat.flesch_kincaid_grade,
    'fog_scale': textstat.gunning_fog,
    'automated_r': textstat.automated_readability_index,
    'coleman': textstat.coleman_liau_index,
    'linsear': textstat.linsear_write_formula,
    # float_output=True makes text_standard return a numeric grade instead of
    # a text label, so this column can take part in the correlation matrix.
    'text_standard': lambda text: textstat.text_standard(text, float_output=True),
}

# Read the excerpts from text_props itself so the scores are guaranteed to
# line up with the rows they are stored on.
for score_col, scorer in readability_tests.items():
    text_props[score_col] = text_props['excerpt'].map(scorer)

In [None]:
# Flesch Reading Ease score for every test excerpt.
flesch_re = [textstat.flesch_reading_ease(text) for text in test_df['excerpt']]

test_df['flesch_re'] = flesch_re

In [None]:
readability_cols = [
    'flesch_re', 'flesch_kg', 'fog_scale', 'automated_r',
    'coleman', 'linsear', 'text_standard', 'target',
]

# Absolute correlations among the readability scores and the target.
corr_read = text_props[readability_cols].corr().abs()

# The matrix is symmetric, so mask the upper triangle including the diagonal.
mask_read = np.triu(np.ones_like(corr_read, dtype=bool))

fig = plt.figure(figsize=(12, 12), dpi=80)
sns.heatmap(
    corr_read, mask=mask_read, cmap='PuBuGn', robust=True, center=0,
    square=True, linewidths=.5, annot=True,
)
plt.title('Correlation of readability tests', fontsize=15, font="Serif")
plt.show()

**flesch_re is highly correlated with target**

In [None]:
# Add the Flesch Reading Ease score to the feature list, then display the list.
filtered_cols.append("flesch_re") 
filtered_cols

In [None]:
plt.figure(figsize=(10,8))
# fill=True shades the area under the density curve; it replaces the
# deprecated shade=True argument.
sns.kdeplot(text_props["flesch_re"], fill=True)
plt.title("Distribution of Flesch Reading Ease test")
plt.show()

In [None]:
# More than 70% of excerpts can be easily understood by 13-15 year olds.
# The value below is the percentage of excerpts whose Flesch Reading Ease exceeds 60.
(text_props['flesch_re'] > 60).sum() / len(text_props) * 100

### Feature Selection

In [None]:
# Besides the engineered features, keep the target and both text columns.
filtered_cols.extend(["target", "excerpt_preprocessed", "excerpt"])

In [None]:
# Training frame: the selected columns, including the target.
final_train_df = text_props.loc[:,filtered_cols]
# Statement order matters: "target" is removed from the list in place before
# the test selection. NOTE(review): re-running this cell without re-running
# the cell that appends "target" raises ValueError here.
filtered_cols.remove("target")
# Test frame: the same columns minus the target.
final_test_df = test_df.loc[:, filtered_cols]