# Load library

In [None]:
# warning
import warnings
warnings.filterwarnings('ignore')

# for NLP
import spacy
from spacy.lang.en import English
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import en_core_web_sm
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary

# for data processing
import pandas as pd
import numpy as np

# for statistics
import statistics
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import collections

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import umap

# for network analysis
import networkx as nx

# for modeling
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn import metrics

# Load dataset

In [None]:
# Read both competition splits and tag every row with the split it came from,
# so the combined frame can be separated again later.
train = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv').assign(flag='train')
test = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv').assign(flag='test')

# Stack train on top of test and renumber the rows 0..n-1.
alldata = pd.concat([train, test], axis=0, ignore_index=True)

# Explore URL and License
* Only about 800 observations in this dataset have a source URL and a license.
* First of all, I explored the effect of URL and license on the target.

In [None]:
# Presence indicators: 1 when the field is filled in, 0 when it is missing.
alldata['url_YN'] = alldata['url_legal'].notna().astype('int64')
alldata['license_YN'] = alldata['license'].notna().astype('int64')

# Third '/'-separated piece of the URL (the host in 'scheme://host/path');
# rows without a URL get an empty string.
alldata['url_domain'] = alldata['url_legal'].map(
    lambda url: "" if pd.isna(url) else url.split('/')[2]
)

## License

In [None]:
# License strings grouped by Creative Commons major version. Matching is exact.
# NOTE(review): the trailing space in 'CC BY-NC-SA 3.0 ' and the 'GFD' suffix are
# kept verbatim -- confirm they match the raw values in the `license` column.
CC_3 = ['CC BY 3.0', 'CC BY-NC 3.0', 'CC BY-NC-SA 3.0 ', 'CC BY-SA 3.0', 'CC BY-SA 3.0 and GFD']
CC_4 = ['CC BY 4.0', 'CC BY-NC-ND 4.0', 'CC BY-NC-SA 4.0']

# Raw license string -> group label.
license_group = {**dict.fromkeys(CC_3, 'CC_3'), **dict.fromkeys(CC_4, 'CC_4')}

# Missing licenses get the literal label 'nan'; any unlisted license is 'others'.
license_cat = [
    'nan' if pd.isna(value) else license_group.get(value, 'others')
    for value in alldata['license']
]
alldata['license_cat'] = license_cat

# Row counts per raw license and per license group.
print(alldata.groupby('license').count()['id'])
print(alldata.groupby('license_cat').count()['id'])

## URL

In [None]:
# Domains grouped by source site.
wiki = ['en.wikibooks.org', 'en.wikipedia.org']
simple = ['simple.wikipedia.org']
african = ['www.africanstorybook.org']
kids = ['kids.frontiersin.org']
lit = ['www.commonlit.org']

# Domain -> group label. `setdefault` keeps the first label if a domain were ever
# listed twice, and the empty domain (no URL) maps to the literal label 'nan'.
url_group = {}
for label, domains in (('wiki', wiki), ('simple', simple), ('african', african),
                       ('kids', kids), ('lit', lit)):
    for domain in domains:
        url_group.setdefault(domain, label)
url_group.setdefault('', 'nan')

# Any domain not listed above falls into 'others'.
url_cat = [url_group.get(domain, 'others') for domain in alldata['url_domain']]
alldata['url_cat'] = url_cat

# Row counts per raw domain and per domain group.
print(alldata.groupby('url_domain').count()['id'])
print(alldata.groupby('url_cat').count()['id'])

In [None]:
# Side-by-side boxplots of the target score: by license group (left) and by URL group (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,10))
for axis, group_col in ((ax1, 'license_cat'), (ax2, 'url_cat')):
    sns.boxplot(x=alldata[group_col], y=alldata['target'], ax=axis)

**The distribution of the target score differs across license/URL categories. These fields may affect the target score, but very few observations have a URL and a license.**

# Basic statistics
* Explore the effect of sentence length and its statistics.

In [None]:
# sentence count and words in sentence count
nlp = English()
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)
nlp = spacy.load("en_core_web_sm")

In [None]:
def _word_count_stats(counts):
    """Return ``[min, max, mean, median, stdev]`` of per-sentence word counts.

    The sample standard deviation is undefined for fewer than two sentences
    (``statistics.stdev`` raises ``StatisticsError``), so NaN is reported in that
    case; an empty list yields NaN for every statistic.
    """
    nan = float('nan')
    if not counts:
        return [nan] * 5
    spread = statistics.stdev(counts) if len(counts) > 1 else nan
    return [min(counts), max(counts), statistics.mean(counts), statistics.median(counts), spread]


sent_list = []            # per excerpt: list of spaCy sentence spans
num_sent_list = []        # per excerpt: number of sentences
num_word_list = []        # per excerpt: word count of each sentence
num_word_stats_list = []  # per excerpt: [min, max, mean, median, stdev] of those counts

# Token texts that are not counted as words.
remove_list = [',', '.', ':', ';', '!', '?', '\n', '-', '=', '/', '#', '$', '(', ')']

for doc in alldata['excerpt']:
    doc = nlp(doc)
    sent_temp = list(doc.sents)
    sent_list.append(sent_temp)
    num_sent_list.append(len(sent_temp))

    # Count the tokens of each sentence whose text is not in `remove_list`.
    num_word_temp = [
        sum(1 for token in s if token.text not in remove_list)
        for s in sent_temp
    ]

    num_word_stats_list.append(_word_count_stats(num_word_temp))
    num_word_list.append(num_word_temp)

In [None]:
# Column names for the five per-excerpt statistics, in the order they are stored
# inside each entry of `num_word_stats_list`.
word_stat_names = ['num_word_min', 'num_word_max', 'num_word_mean',
                   'num_word_median', 'num_word_stdev']

# One row per excerpt: sentence count followed by the word-count statistics.
df_statistics = pd.DataFrame({
    'num_sentence': num_sent_list,
    **{name: [stats[k] for stats in num_word_stats_list]
       for k, name in enumerate(word_stat_names)},
})

In [None]:
statistics_col = df_statistics.columns
fig,axes = plt.subplots(nrows=3,ncols=2,figsize=(14,14))

# Fill the 3x2 grid row by row: one sentence-length statistic per panel,
# plotted against the target score.
for k, panel in enumerate(axes.ravel()):
    stat_name = statistics_col[k]
    panel.scatter(df_statistics[stat_name], alldata['target'])
    panel.set_title(stat_name)

The number of words per sentence may affect readability. Intuitively, this is reasonable, because students cannot understand the meaning clearly if a sentence is too long.

# Word type and dependency type
* Does the ratio of word types (e.g. ADJ, NOUN, VERB, etc.) in the sentence affect readability?
* Does the ratio of dependency types (e.g. prep, pobj, det, etc.) in the sentence affect readability?

## word type

In [None]:
# For every excerpt, compute the share of its tokens that carry each coarse POS tag.
# The per-excerpt ratios are collected first and the frame is built once, instead of
# outer-merging one new column per excerpt into a growing frame.
pos_ratio_by_doc = {}
for doc_id, excerpt in zip(alldata['id'], alldata['excerpt']):
    pos_counts = collections.Counter(token.pos_ for token in nlp(excerpt))
    n_tokens = sum(pos_counts.values())
    pos_ratio_by_doc[doc_id] = {tag: count / n_tokens for tag, count in pos_counts.items()}

# Rows = POS tags (alphabetical, for a deterministic order), columns = excerpt ids.
# A tag that never occurs in an excerpt is NaN at this point.
token_df = pd.DataFrame(pos_ratio_by_doc).sort_index()

In [None]:
# Flip to one row per excerpt / one column per POS tag; a tag that is absent from an
# excerpt has ratio 0. Note: this reassigns `token_df`, so re-running only this cell
# transposes the frame back.
token_df = token_df.T.fillna(0)

In [None]:
pos_name_list = token_df.columns

# Size the grid from the number of POS tags actually present (3 panels per row)
# rather than assuming exactly 18; with 18 tags this is the same 6x3, 16x24 figure.
n_cols = 3
n_rows = max(1, (len(pos_name_list) + n_cols - 1) // n_cols)  # ceiling division
fig,axes = plt.subplots(nrows=n_rows,ncols=n_cols,figsize=(16,4*n_rows),squeeze=False)

# One panel per POS tag: tag ratio (x) against target score (y), filled row by row.
panels = axes.ravel()
for panel, pos_name in zip(panels, pos_name_list):
    panel.scatter(token_df[pos_name],alldata['target'])
    panel.set_title(pos_name)

# Hide the panels of the last row that have no tag to show.
for panel in panels[len(pos_name_list):]:
    panel.set_visible(False)

**Wow, the VERB ratio and the NOUN ratio may affect readability!**

## dependency type

In [None]:
# For every excerpt, compute the share of its tokens that carry each dependency label.
# The per-excerpt ratios are collected first and the frame is built once, instead of
# outer-merging one new column per excerpt into a growing frame.
dep_ratio_by_doc = {}
for doc_id, excerpt in zip(alldata['id'], alldata['excerpt']):
    dep_counts = collections.Counter(token.dep_ for token in nlp(excerpt))
    n_tokens = sum(dep_counts.values())
    dep_ratio_by_doc[doc_id] = {label: count / n_tokens for label, count in dep_counts.items()}

# Rows = dependency labels (alphabetical, for a deterministic order), columns = excerpt ids.
# A label that never occurs in an excerpt is NaN at this point.
dep_df = pd.DataFrame(dep_ratio_by_doc).sort_index()

In [None]:
# Flip to one row per excerpt / one column per dependency label; a label that is absent
# from an excerpt has ratio 0. Note: this reassigns `dep_df`, so re-running only this
# cell transposes the frame back.
dep_df = dep_df.T.fillna(0)

In [None]:
dep_name_list = dep_df.columns

# Size the grid from the number of dependency labels actually present (5 panels per
# row) rather than assuming exactly 45; with 45 labels this is the same 9x5, 16x24 figure.
n_cols = 5
n_rows = max(1, (len(dep_name_list) + n_cols - 1) // n_cols)  # ceiling division
fig,axes = plt.subplots(nrows=n_rows,ncols=n_cols,figsize=(16,n_rows*24/9),squeeze=False)

# One panel per dependency label: label ratio (x) against target score (y), row by row.
panels = axes.ravel()
for panel, dep_name in zip(panels, dep_name_list):
    panel.scatter(dep_df[dep_name],alldata['target'])
    panel.set_title(dep_name)

# Hide the panels of the last row that have no label to show.
for panel in panels[len(dep_name_list):]:
    panel.set_visible(False)

**Oh, some dependency types have an effect on readability!**

# Dependency network analysis

## sentence as a network
* Using the dependency analysis result, we can analyze a sentence as a network. So we can extract some network indices (e.g. degree, density, diameter, etc.) as features.

In [None]:
# Parse the first excerpt and keep its sentences for a worked example.
test_doc = nlp(alldata['excerpt'][0])
test_sent = list(test_doc.sents)

# One (head index, token index) pair per token of the first sentence,
# leaving out punctuation tokens and the sentence root.
first_sentence_edges = [
    (token.head.i, token.i)
    for token in test_sent[0]
    if token.pos_ != 'PUNCT' and token.dep_ != 'ROOT'
]
source_list = [head_i for head_i, _ in first_sentence_edges]
target_list = [token_i for _, token_i in first_sentence_edges]

df_dep = pd.DataFrame({'source': source_list, 'target': target_list})
print(df_dep)

# Turn the edge table into a graph and draw it.
dep_g = nx.from_pandas_edgelist(df_dep)
nx.draw_networkx(dep_g)

**It's super interesting!**

## Extract features from network analysis

In [None]:
def create_edge_df(doc):
    """Parse ``doc`` with the module-level ``nlp`` pipeline and list its dependency edges.

    Every token contributes one edge from its head to itself, except tokens whose
    ``pos_`` is ``'PUNCT'`` or whose ``dep_`` is ``'ROOT'``, which are skipped.

    Parameters
    ----------
    doc : str
        Raw text to parse.

    Returns
    -------
    pandas.DataFrame
        Columns ``'source'`` (``token.head.i``) and ``'target'`` (``token.i``),
        one row per kept token, in token order.
    """
    edges = [
        (token.head.i, token.i)
        for token in nlp(doc)
        if token.pos_ != 'PUNCT' and token.dep_ != 'ROOT'
    ]
    return pd.DataFrame({
        'source': [head_i for head_i, _ in edges],
        'target': [token_i for _, token_i in edges],
    })

In [None]:
def create_dep_network(doc):
    """Build a networkx graph from the dependency edges of ``doc``.

    The edge table comes from ``create_edge_df``; its ``'source'`` and ``'target'``
    columns are the defaults that ``nx.from_pandas_edgelist`` reads.
    """
    return nx.from_pandas_edgelist(create_edge_df(doc))

In [None]:
def _summary_stats(prefix, values):
    """Return min/max/mean/median/std of ``values``, keyed ``<prefix>_<stat>``.

    The sample standard deviation needs at least two values (``statistics.stdev``
    raises ``StatisticsError`` otherwise), so it is NaN for a single value; an empty
    list yields NaN for every statistic.
    """
    nan = float('nan')
    if not values:
        return {prefix + '_' + stat: nan for stat in ('min', 'max', 'mean', 'median', 'std')}
    return {
        prefix + '_min': min(values),
        prefix + '_max': max(values),
        prefix + '_mean': statistics.mean(values),
        prefix + '_median': statistics.median(values),
        prefix + '_std': statistics.stdev(values) if len(values) > 1 else nan,
    }


def calculate_index(doc):
    """Compute network indices of the dependency graph of ``doc``.

    Parameters
    ----------
    doc : str
        Raw text; it is parsed and turned into a graph by ``create_dep_network``.

    Returns
    -------
    dict
        Fifteen entries: ``min``/``max``/``mean``/``median``/``std`` for each of
        ``degree`` (over all nodes), ``density`` and ``diameter`` (over the connected
        components of the graph). A statistic that is undefined -- a spread over a
        single component, or anything on an empty graph -- is NaN instead of raising.
        Degree statistics are returned as floats.
    """
    g = create_dep_network(doc)

    # degree of every node; float dtype so that an empty graph reduces to NaN
    degree = pd.Series(dict(nx.degree(g)), dtype='float64')

    # density and diameter are measured per connected component
    components = [g.subgraph(c) for c in nx.connected_components(g)]
    density_list = [nx.density(sub) for sub in components]
    diameter_list = [nx.diameter(sub) for sub in components]

    index_dict = {'degree_min':degree.min(),
                  'degree_max':degree.max(),
                  'degree_mean':degree.mean(),
                  'degree_median':degree.median(),
                  'degree_std':degree.std()}
    index_dict.update(_summary_stats('density', density_list))
    index_dict.update(_summary_stats('diameter', diameter_list))
    return(index_dict)

In [None]:
# Compute the network indices of every excerpt, then build the frame in one go
# (one row per excerpt id, one column per index) instead of outer-merging one
# column per excerpt into a growing frame and transposing at the end.
index_records = [calculate_index(excerpt) for excerpt in alldata['excerpt']]

# Cast to float so every index column has the same dtype regardless of whether a
# particular statistic happened to come back as an int.
index_df = pd.DataFrame(index_records, index=alldata['id'].tolist()).astype('float64')

In [None]:
fig,axes = plt.subplots(nrows=5,ncols=3,figsize=(16,24))
index_name_list = index_df.columns

# Fill the 5x3 grid row by row: one network index per panel,
# plotted against the target score.
for k, panel in enumerate(axes.ravel()):
    index_name = index_name_list[k]
    panel.scatter(index_df[index_name], alldata['target'])
    panel.set_title(index_name)

# Word vector from SpaCy

## extract word vector
* We can extract word vectors from spaCy's default model.
* For a whole document, we can extract the mean word vector.

In [None]:
# Show the vector spaCy assigns to the first sentence of the first excerpt.
test_doc = nlp(alldata['excerpt'][0])
test_sent = list(test_doc.sents)

first_sentence_vector = test_sent[0].vector
print('word vector for sentence : ', first_sentence_vector)
print('word vector length : ', len(first_sentence_vector))

## extract word vector from all document
* But before extraction, we need to remove some word types because they cause noise.
* Imagine: the word 'it' has a vector, and the punctuation ',' also has a vector. Do these words/punctuation marks contribute to the total meaning?

In [None]:
# POS tags whose tokens are kept; an empty list means "keep every token".
POS = ['PRON', 'PROPN', 'NOUN', 'VERB', 'ADJ', 'ADV','ADP']
docs = alldata['excerpt']
keep_every_token = len(POS) == 0

# Rebuild each excerpt as the space-joined lemmas of its kept tokens.
new_docs = []
for excerpt in docs:
    kept_lemmas = [
        token.lemma_
        for token in nlp(excerpt)
        if keep_every_token or token.pos_ in POS
    ]
    new_docs.append(" ".join(kept_lemmas))

In [None]:
# Re-parse each filtered excerpt and keep its document-level vector.
new_wv_list = [nlp(filtered_text).vector for filtered_text in new_docs]

In [None]:
# One row per filtered excerpt (same order as `new_docs`), built from the list of
# document vectors collected above.
df_wv_new = pd.DataFrame(new_wv_list)

## PCA for word vector
* The word vector in spaCy has 96 dimensions, so plotting them directly is hard to interpret.
* So I plotted some principal components vs target score.

In [None]:
# Project the document vectors onto their first 10 principal components.
# `random_state` is fixed so the projection is reproducible across runs: with the
# default `svd_solver='auto'`, scikit-learn may pick the randomized solver for input
# of this size, which is otherwise seeded from the global NumPy state.
pca = PCA(n_components = 10, random_state = 0)
res_pca_10 = pca.fit_transform(df_wv_new)

In [None]:
fig,axes = plt.subplots(nrows=5,ncols=2,figsize=(16,32))

# Fill the 5x2 grid row by row: one principal component per panel,
# plotted against the target score.
for k, panel in enumerate(axes.ravel()):
    panel.scatter(res_pca_10[:, k], alldata['target'])
    panel.set_title('PCA'+str(k))

**Yes, the 1st principal component is related to the target score!**