# **Imports and installations**
Here we will be importing the necessary packages as well as installing the new ones. 

In [None]:
!pip install -U sentence-transformers
!pip3 install summa
!pip install textstat

In [None]:
import numpy as np
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import networkx as nx
import textstat

from torchtext import vocab
from summa import keywords
from wordcloud import WordCloud, STOPWORDS
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure

# **Loading the data**
In this section we load the dataset into pandas DataFrames. 
The dataset contains three files: train.csv, test.csv and sample_submission.csv. The train.csv and test.csv files have the following columns:
* id - unique ID for excerpt
* url_legal - URL of source - this is blank in the test set.
* license - license of source material - this is blank in the test set.
* excerpt - text to predict reading ease of
* target - reading ease. This score ranges from -4 to +2. Greater the number, greater the ease of readability. 
* standard_error - measure of spread of scores among multiple raters for each excerpt. Not included for test data.



In [None]:
# Single place to change when the competition data lives somewhere else.
DATA_DIR = '../input/commonlitreadabilityprize'

# Locations of the three competition files inside DATA_DIR.
train_data_csv_path = DATA_DIR + '/train.csv'
test_data_csv_path = DATA_DIR + '/test.csv'
sample_submission_csv_path = DATA_DIR + '/sample_submission.csv'

In [None]:
# Read the train and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_csv_path, test_data_csv_path)
)

# Report the number of rows in each split.
print("Length of training data: ", len(train_data))
print("Length of testing data: ", len(test_data))

Let's look at the first few data points.

In [None]:
# Show the first ten rows of the training frame.
train_data.iloc[:10]

In [None]:
# All training excerpts as a NumPy array.
text = train_data.loc[:, 'excerpt'].values
# Display the first excerpt.
text[0]

We will classify our data into easy and hard essays **for visualization purposes only**. Easy essays are the ones whose scores are greater than or equal to -1 and hard essays are the ones whose scores are less than -1. 

In [None]:
# Row masks for the two groups: target >= -1 is "easy", target < -1 is "hard".
is_easy = train_data['target'] >= -1
is_hard = train_data['target'] < -1

easy_rows = train_data[is_easy]
hard_rows = train_data[is_hard]

# Excerpts and their scores, kept as parallel Series per group.
easy_text, easy_score = easy_rows['excerpt'], easy_rows['target']
hard_text, hard_score = hard_rows['excerpt'], hard_rows['target']

print("Easy text present in the data: ", len(easy_text))
print("Hard text present in the data: ", len(hard_text))

# **Plotting word length frequency**


We will first try to understand the dataset by looking at the length of each data point. We plot how often each excerpt length (number of word tokens) occurs in our data, showing the 25 most common lengths. 

In [None]:
# Default figure size (width, height) for matplotlib figures created from here on.
plt.rcParams["figure.figsize"] = (15,10)

def get_length_dict(text,type_of_text):
    """Tally excerpt lengths (in tokens) and return the 25 most frequent ones.

    Parameters
    ----------
    text : sized iterable of str
        The excerpts to measure; each is tokenized with ``word_tokenize``.
    type_of_text : str
        Label inserted into the printed summary line.

    Returns
    -------
    dict
        Maps token count -> number of excerpts with that count, ordered from
        most to least frequent and limited to 25 entries.

    Side effect: prints the mean token count over all excerpts.
    """
    token_counts = [len(word_tokenize(excerpt)) for excerpt in text]

    # Frequency of each distinct token count, in first-seen order.
    tally = {}
    for count in token_counts:
        tally[count] = tally.get(count, 0) + 1

    mean_length = sum(token_counts) / len(text)
    print("Mean word length of the {} data: {} ".format(type_of_text,mean_length))

    # Most frequent lengths first; keep the top 25.
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:25])

# Bar chart of the most common excerpt lengths (in tokens) over the whole corpus.
length_dict = get_length_dict(text,'Total')
bar_positions = range(len(length_dict))
plt.bar(bar_positions, list(length_dict.values()), align='center')
# Label each bar with the excerpt length it represents.
plt.xticks(bar_positions, list(length_dict.keys()))
plt.xlabel('Number of words in excerpt')
plt.ylabel('Number of excerpts')
# plt.title requires a label; calling it with no argument raises TypeError.
plt.title('Most common excerpt lengths')
plt.show()

As we can see in our plot, most of our datapoints contain 190-200 words. 

# **Visualizing Word Cloud**
A word cloud helps us visualize the most prominent words in the whole text corpus.  

In [None]:
def create_title_word_cloud(text):
    """Build a word cloud from one string or from a collection of documents.

    Parameters
    ----------
    text : str or iterable
        A single string is used as-is; any other iterable has its items
        converted to str and joined with spaces.

    Returns
    -------
    The object returned by ``WordCloud(...).generate(...)``.
    """
    # str() of an array or Series is its printable repr (brackets, quotes, and
    # an abbreviated body for large inputs), not the documents themselves, so
    # the documents are joined explicitly.
    if isinstance(text, str):
        corpus = text
    else:
        corpus = ' '.join(str(document) for document in text)

    wordcloud = WordCloud(
        width = 3000,
        height = 2000,
        background_color = 'black',
        stopwords = STOPWORDS).generate(corpus)
    return wordcloud

# Word cloud for the full corpus, drawn on a large black canvas without axes.
wordcloud = create_title_word_cloud(text)
fig = plt.figure(figsize=(40, 30), facecolor='k', edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

As we can see from the above word cloud, the words are common English words without many orthographic errors. 

# **Plotting n-grams**
We will plot the most common unigrams, bigrams and trigrams for the whole text corpus. 

In [None]:
def generate_top_k_n_grams(text,k,n):
    """Return the k most frequent n-grams in a collection of documents.

    Parameters
    ----------
    text : iterable of str
        Documents to count n-grams in.
    k : int
        Maximum number of rows to return.
    n : tuple
        Passed to ``CountVectorizer`` as ``ngram_range``, e.g. ``(2, 2)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``words`` and ``frequency``, sorted by descending frequency and
        truncated to the first k rows. English stop words are excluded.
    """
    word_vectorizer = CountVectorizer(ngram_range=n,stop_words='english')
    sparse_matrix = word_vectorizer.fit_transform(text)

    # Total count per n-gram: one column-wise sum instead of adding the rows
    # together one by one with the builtin sum().
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; use the replacement
    # when it exists and fall back to the old name on older installations.
    get_names = getattr(word_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = word_vectorizer.get_feature_names

    freq_df = pd.DataFrame({
        'words': get_names(),
        'frequency': frequencies
    })
    return freq_df.sort_values('frequency',ascending=False)[:k]

In [None]:
# 30 most frequent unigrams over the whole corpus.
freq_data = generate_top_k_n_grams(text,30,(1,1))
# The frame's columns are 'words' and 'frequency'; 'unigram words' is not a
# column, so it is used as the axis caption instead of the x variable.
sns.barplot(x = 'words', y = 'frequency',data = freq_data,
            palette = 'hls',
    )
plt.xlabel('unigram words')
plt.xticks(rotation=45,fontsize=13)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# 30 most frequent bigrams over the whole corpus.
freq_data = generate_top_k_n_grams(text,30,(2,2))
# The frame's columns are 'words' and 'frequency'; 'bigram words' is not a
# column, so it is used as the axis caption instead of the x variable.
sns.barplot(x = 'words', y = 'frequency',data = freq_data,
            palette = 'hls',
    )
plt.xlabel('bigram words')
plt.xticks(rotation=45,fontsize=13)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# 30 most frequent trigrams over the whole corpus.
freq_data = generate_top_k_n_grams(text,30,(3,3))
# The frame's columns are 'words' and 'frequency'; 'trigram words' is not a
# column, so it is used as the axis caption instead of the x variable.
sns.barplot(x = 'words', y = 'frequency',data = freq_data,
            palette = 'hls',
    )
plt.xlabel('trigram words')
plt.xticks(rotation=45,fontsize=13)
plt.yticks(fontsize=13)
plt.show()

# **Mean length of easy vs hard essays**
We compare the length of the easy and the hard essays. 

In [None]:
easy_length_dict = get_length_dict(easy_text,'Easy')
hard_length_dict = get_length_dict(hard_text,'Hard')

# One bar chart per group, side by side: (panel title, length tally).
length_panels = (
    ("Easy essays", easy_length_dict),
    ("Hard essays", hard_length_dict),
)
for panel_no, (panel_title, length_tally) in enumerate(length_panels, start=1):
    plt.subplot(1, 2, panel_no)
    bar_slots = range(len(length_tally))
    plt.bar(bar_slots, list(length_tally.values()), align='center')
    plt.xticks(bar_slots, list(length_tally.keys()), rotation=45, fontsize=13)
    plt.title(panel_title)
plt.show()

As we can see, the length of the essays doesn't make a major difference to their readability scores. 

# **N-gram comparison for easy vs hard essays**
We now compare the most common n-grams for easy as well as hard essays

In [None]:
# 30 most frequent unigrams in each group.
easy_freq_data = generate_top_k_n_grams(easy_text,30,(1,1))
hard_freq_data = generate_top_k_n_grams(hard_text,30,(1,1))

# (panel title, frequency table, x-tick rotation). The easy panel was
# previously titled 'Easy text bigrams' although it shows unigrams.
unigram_panels = (
    ('Easy text unigrams', easy_freq_data, 45),
    ('Hard text unigrams', hard_freq_data, 20),
)
for panel_no, (panel_title, panel_data, tick_rotation) in enumerate(unigram_panels, start=1):
    plt.subplot(1, 2, panel_no)
    # Horizontal bars: n-gram on the y axis, count on the x axis.
    sns.barplot(y = 'words', x = 'frequency', data = panel_data,
                palette = 'hls',
        )
    plt.xticks(rotation=tick_rotation, fontsize=13)
    plt.yticks(fontsize=13)
    plt.title(panel_title)
plt.show()

In [None]:
# 30 most frequent bigrams in each group.
easy_freq_data = generate_top_k_n_grams(easy_text,30,(2,2))
hard_freq_data = generate_top_k_n_grams(hard_text,30,(2,2))

# (panel title, frequency table, x-tick rotation)
bigram_panels = (
    ('Easy text bigrams', easy_freq_data, 45),
    ('Hard text bigrams', hard_freq_data, 20),
)
for panel_no, (panel_title, panel_data, tick_rotation) in enumerate(bigram_panels, start=1):
    plt.subplot(1, 2, panel_no)
    # Horizontal bars: n-gram on the y axis, count on the x axis.
    sns.barplot(y='words', x='frequency', data=panel_data, palette='hls')
    plt.xticks(rotation=tick_rotation, fontsize=13)
    plt.yticks(fontsize=13)
    plt.title(panel_title)
plt.show()


In [None]:
# 30 most frequent trigrams in each group.
easy_freq_data = generate_top_k_n_grams(easy_text,30,(3,3))
hard_freq_data = generate_top_k_n_grams(hard_text,30,(3,3))

# (panel title, frequency table, x-tick rotation)
trigram_panels = (
    ('Easy text trigrams', easy_freq_data, 45),
    ('Hard text trigrams', hard_freq_data, 20),
)
for panel_no, (panel_title, panel_data, tick_rotation) in enumerate(trigram_panels, start=1):
    plt.subplot(1, 2, panel_no)
    # Horizontal bars: n-gram on the y axis, count on the x axis.
    sns.barplot(y='words', x='frequency', data=panel_data, palette='hls')
    plt.xticks(rotation=tick_rotation, fontsize=13)
    plt.yticks(fontsize=13)
    plt.title(panel_title)
plt.show()

From the above frequencies of the common words, we can see that in hard essays the frequency of the most common words is usually lower than in easy essays. We can thus say that the hard essays have a more diverse set of unigrams, bigrams and trigrams compared to the easy essays. 

# **Easy wordcloud vs hard wordcloud**
We will try to plot the wordclouds for easy essays and hard essays. 

In [None]:
easy_wordcloud = create_title_word_cloud(easy_text)
hard_wordcloud = create_title_word_cloud(hard_text)

# Two clouds side by side: (panel title, cloud image).
cloud_panels = (
    ('Easy text', easy_wordcloud),
    ('Hard text', hard_wordcloud),
)
for panel_no, (panel_title, cloud) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(panel_title)
    plt.tight_layout(pad=0)

# Stretch the subplot area beyond the default figure bounds.
plt.subplots_adjust(left=0.1, bottom=0.1, right=2, top=2.5,
                    wspace=0.2, hspace=0.2)
plt.show()

# **Plotting the document embeddings of essays**
Here, we use BERT (via sentence-transformers) to get document embeddings for the first 100 easy and the first 100 hard excerpts, and then use PCA to plot them in 2-dimensional and 3-dimensional space. 

In [None]:
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Only the first 100 excerpts of each group are embedded.
easy_text_slice = easy_text[:100]
hard_text_slice = hard_text[:100]

# One embedding per excerpt, kept in plain Python lists so the two groups can be
# concatenated with `+` later. The comprehension variable is local, so the
# notebook-level `text` array is no longer overwritten by a single excerpt.
easy_embeddings = [model.encode(excerpt) for excerpt in easy_text_slice]
hard_embeddings = [model.encode(excerpt) for excerpt in hard_text_slice]

In [None]:
# Fit a 2-component PCA on the easy and hard document embeddings together.
pca_2 = PCA(n_components = 2)
total_text = easy_embeddings + hard_embeddings
pca_2.fit(total_text)

# Project each group separately so it can be labelled.
easy_pca_output = pca_2.transform(easy_embeddings)
hard_pca_output = pca_2.transform(hard_embeddings)

# Columns 0 and 1 hold the components; column 2 holds the group label.
easy_df = pd.DataFrame(easy_pca_output)
type_col = ['easy'] * len(easy_df)
easy_df.insert(2, 'type_of_data', type_col)

hard_df = pd.DataFrame(hard_pca_output)
type_col = ['hard'] * len(hard_df)
hard_df.insert(2, 'type_of_data', type_col)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
total_df = pd.concat([easy_df, hard_df], ignore_index=True)

fig = px.scatter(
    total_df.iloc[: , :2], x=0, y=1, color=total_df['type_of_data'],
    title='Plotting document embeddings in 2 dimensions'
)
fig.show()

In [None]:
# Fit a 3-component PCA on the easy and hard document embeddings together.
pca = PCA(n_components = 3)
total_text = easy_embeddings + hard_embeddings
pca.fit(total_text)

# Project each group separately so it can be labelled.
easy_pca_output = pca.transform(easy_embeddings)
hard_pca_output = pca.transform(hard_embeddings)

# Columns 0-2 hold the components; column 3 holds the group label.
easy_df = pd.DataFrame(easy_pca_output)
type_col = ['easy'] * len(easy_df)
easy_df.insert(3, 'type_of_data', type_col)

hard_df = pd.DataFrame(hard_pca_output)
type_col = ['hard'] * len(hard_df)
hard_df.insert(3, 'type_of_data', type_col)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
total_df = pd.concat([easy_df, hard_df], ignore_index=True)

fig = px.scatter_3d(
    total_df.iloc[: , :3], x=0, y=1, z=2, color=total_df['type_of_data'],
    title='Plotting document embeddings in 3 dimensions'
)
fig.show()

# **Visualizing n-gram keywords in 2D and 3D space**
We first find the keywords using the summa library. Then we look up 300-dimensional GloVe embeddings for those keywords. Finally, using PCA, we plot them in 2D as well as 3D space. 

In [None]:
# Keyword list per excerpt. summa returns one newline-separated string; empty
# pieces are dropped so an excerpt with no keywords yields [] rather than [''].
# Comprehension variables stay local, so the notebook-level `text` is untouched.
easy_keywords = [
    [keyword for keyword in keywords.keywords(excerpt).split('\n') if keyword]
    for excerpt in easy_text
]

hard_keywords = [
    [keyword for keyword in keywords.keywords(excerpt).split('\n') if keyword]
    for excerpt in hard_text
]

In [None]:
# Unique keywords per group. dict.fromkeys de-duplicates while keeping
# first-seen order, so the result (and any slice taken from it) is the same on
# every run; list(set(...)) order depends on string hashing.
easy_keyword_set = list(dict.fromkeys(e for l in easy_keywords for e in l))
hard_keyword_set = list(dict.fromkeys(e for l in hard_keywords for e in l))

print(len(easy_keyword_set),len(hard_keyword_set))

In [None]:
# Shell command: copy the GloVe 300-d vector file from the input directory into the current directory.
!cp -r  ../input/glove6b/glove.6B.300d.txt ./

In [None]:
VECTOR_PATH = './'
VECTOR_NAME = 'glove.6B.300d.txt'

embeddings = vocab.Vectors(VECTOR_NAME,VECTOR_PATH)

# Only the first 100 unique keywords of each group are embedded.
sliced_easy_keyword_set = easy_keyword_set[:100]
sliced_hard_keyword_set = hard_keyword_set[:100]


def _nonzero_vectors(words):
    """Look up each word once and keep vectors with at least one non-zero entry.

    NOTE(review): all-zero vectors are presumably what the lookup returns for
    words missing from the GloVe vocabulary -- confirm against torchtext docs.
    """
    looked_up = (embeddings[word] for word in words)
    return [vector for vector in looked_up if bool((vector != 0).any())]


easy_embeddings = _nonzero_vectors(sliced_easy_keyword_set)
hard_embeddings = _nonzero_vectors(sliced_hard_keyword_set)

print(len(easy_embeddings),len(hard_embeddings))

In [None]:
# Convert the keyword vectors to NumPy once and reuse them for fit and transform.
easy_embeddings_list = [x.numpy() for x in easy_embeddings]
hard_embeddings_list = [x.numpy() for x in hard_embeddings]
total_text = easy_embeddings + hard_embeddings
total_text_list = easy_embeddings_list + hard_embeddings_list

# Fit a 2-component PCA on both groups together, then project each group.
pca_2 = PCA(n_components = 2)
pca_2.fit(total_text_list)
easy_pca_output = pca_2.transform(easy_embeddings_list)
hard_pca_output = pca_2.transform(hard_embeddings_list)

# Columns 0 and 1 hold the components; column 2 holds the group label.
easy_df = pd.DataFrame(easy_pca_output)
type_col = ['easy'] * len(easy_df)
easy_df.insert(2, 'type_of_data', type_col)

hard_df = pd.DataFrame(hard_pca_output)
type_col = ['hard'] * len(hard_df)
hard_df.insert(2, 'type_of_data', type_col)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
total_df = pd.concat([easy_df, hard_df], ignore_index=True)

fig = px.scatter(
    total_df.iloc[: , :2], x=0, y=1, color=total_df['type_of_data'],
    title='Plotting keyword embeddings in 2 dimensions'
)
fig.show()

In [None]:
# Convert the keyword vectors to NumPy once and reuse them for fit and transform.
easy_embeddings_list = [x.numpy() for x in easy_embeddings]
hard_embeddings_list = [x.numpy() for x in hard_embeddings]
total_text = easy_embeddings + hard_embeddings
total_text_list = easy_embeddings_list + hard_embeddings_list

# Fit a 3-component PCA on both groups together, then project each group.
pca_3 = PCA(n_components = 3)
pca_3.fit(total_text_list)
easy_pca_output = pca_3.transform(easy_embeddings_list)
hard_pca_output = pca_3.transform(hard_embeddings_list)

# Columns 0-2 hold the components; column 3 holds the group label.
easy_df = pd.DataFrame(easy_pca_output)
type_col = ['easy'] * len(easy_df)
easy_df.insert(3, 'type_of_data', type_col)

hard_df = pd.DataFrame(hard_pca_output)
type_col = ['hard'] * len(hard_df)
hard_df.insert(3, 'type_of_data', type_col)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
total_df = pd.concat([easy_df, hard_df], ignore_index=True)

fig = px.scatter_3d(
    total_df.iloc[: , :3], x=0, y=1,z=2, color=total_df['type_of_data'],
    title='Plotting keyword embeddings in 3 dimensions'
)
fig.show()

# **Checking word and character average count**
We calculate the words per sentence, characters per sentence and characters per word for both easy and hard text. 

In [None]:
def _length_ratios(excerpts):
    """Compute three per-excerpt length ratios.

    Returns three lists with one entry per excerpt, in this order:
    words per sentence, characters per sentence, characters per word.
    Characters are counted over the word tokens, so whitespace between
    tokens is not included.
    """
    words_per_sentence = []
    characters_per_sentence = []
    characters_per_word = []
    for excerpt in excerpts:
        tokens = word_tokenize(excerpt)
        n_sentences = len(sent_tokenize(excerpt))
        # Counted once and reused for both character-based ratios.
        n_characters = sum(len(token) for token in tokens)
        words_per_sentence.append(len(tokens) / n_sentences)
        characters_per_sentence.append(n_characters / n_sentences)
        characters_per_word.append(n_characters / len(tokens))
    return words_per_sentence, characters_per_sentence, characters_per_word


# The same computation for both groups; the helper's loop variable is local, so
# the notebook-level `text` is not overwritten.
(easy_words_per_sentences,
 easy_characters_per_sentences,
 easy_characters_per_word) = _length_ratios(easy_text)

(hard_words_per_sentences,
 hard_characters_per_sentences,
 hard_characters_per_word) = _length_ratios(hard_text)

print("Easy characters per words: ",sum(easy_characters_per_word)/len(easy_characters_per_word))
print("Easy characters per sentences: ",sum(easy_characters_per_sentences)/len(easy_characters_per_sentences))
print("Easy words per sentences: ",sum(easy_words_per_sentences)/len(easy_words_per_sentences))


print("Hard characters per words: ",sum(hard_characters_per_word)/len(hard_characters_per_word))
print("Hard characters per sentences: ",sum(hard_characters_per_sentences)/len(hard_characters_per_sentences))
print("Hard words per sentences: ",sum(hard_words_per_sentences)/len(hard_words_per_sentences))

In [None]:
# Scores as plain lists, shared by all three panels.
easy_targets = list(easy_score.values)
hard_targets = list(hard_score.values)

# (panel title, easy x-values, hard x-values); the score is on the y axis.
ratio_score_panels = (
    ('Score v/s Characters per word',
     easy_characters_per_word, hard_characters_per_word),
    ('Score v/s Characters per sentences',
     easy_characters_per_sentences, hard_characters_per_sentences),
    ('Score v/s Words per sentences',
     easy_words_per_sentences, hard_words_per_sentences),
)
for panel_no, (panel_title, easy_values, hard_values) in enumerate(ratio_score_panels, start=1):
    plt.subplot(1, 3, panel_no)
    plt.title(panel_title)
    plt.scatter(easy_values, easy_targets)
    plt.scatter(hard_values, hard_targets)

# Stretch the subplot area beyond the default figure bounds.
plt.subplots_adjust(left=0.1, bottom=0.1, right=2, top=1.2,
                    wspace=0.4, hspace=0.4)

plt.show()

In [None]:
# (panel title, easy values, hard values); easy is drawn in blue, hard in red.
density_panels = (
    ('Characters per Word',
     easy_characters_per_word, hard_characters_per_word),
    ('Characters per Sentences',
     easy_characters_per_sentences, hard_characters_per_sentences),
    ('Words per Sentences',
     easy_words_per_sentences, hard_words_per_sentences),
)
for panel_no, (panel_title, easy_values, hard_values) in enumerate(density_panels, start=1):
    plt.subplot(1, 3, panel_no)
    sns.kdeplot(data=easy_values, color='blue')
    sns.kdeplot(data=hard_values, color='red')
    plt.title(panel_title)

# Stretch the subplot area beyond the default figure bounds.
plt.subplots_adjust(left=0.1, bottom=0.1, right=2, top=1.2,
                    wspace=0.4, hspace=0.4)

plt.show()

We can see from the above density plots that the density function for the hard essays in all three graphs is shifted slightly to the right compared to the easy essays. The right shift indicates that the words per sentence, characters per sentence and characters per word are higher in hard essays than in easy essays.

# **POS Tagging for easy vs hard essays**
We use spaCy to extract different parts of speech and then plot their counts against the score. 

In [None]:
nlp = spacy.load('en_core_web_sm')

# Part-of-speech tags that are counted for every excerpt.
POS_TAGS = ('NOUN', 'PROPN', 'VERB', 'ADJ', 'ADP', 'AUX')


def _pos_counts(excerpts):
    """Count the tracked POS tags in every excerpt.

    Returns a pair of dicts ``(counts, counts_per_sentence)``. Each maps a tag
    from POS_TAGS to a list with one entry per excerpt: the raw tag count, and
    that count divided by the excerpt's number of sentences (sent_tokenize).
    """
    counts = {tag: [] for tag in POS_TAGS}
    counts_per_sentence = {tag: [] for tag in POS_TAGS}
    for excerpt in excerpts:
        tally = dict.fromkeys(POS_TAGS, 0)
        for token in nlp(excerpt):
            # Explicit membership test instead of relying on list truthiness.
            if token.pos_ in tally:
                tally[token.pos_] += 1
        # Sentence count is the same for every tag, so compute it once.
        n_sentences = len(sent_tokenize(excerpt))
        for tag, tag_count in tally.items():
            counts[tag].append(tag_count)
            counts_per_sentence[tag].append(tag_count / n_sentences)
    return counts, counts_per_sentence


# The helper's loop variable is local, so the notebook-level `text` is not overwritten.
easy_dictionary, easy_dictionary_per_sentence = _pos_counts(easy_text)
hard_dictionary, hard_dictionary_per_sentence = _pos_counts(hard_text)
        

In [None]:
# (POS tag, easy colour, hard colour, panel title) for the 2x3 grid.
per_sentence_panels = (
    ('NOUN', "blue", "lightblue", 'Nouns per sentence vs score'),
    ('PROPN', "green", "lightgreen", 'Proper nouns per sentence vs score'),
    ('VERB', "red", "pink", 'Verbs per sentence vs score'),
    ('ADJ', "orange", "yellow", 'Adjectives per sentence vs score'),
    ('ADP', "darkgrey", "lightgrey", 'ADP per sentence vs score'),
    ('AUX', "brown", "peru", 'Auxillary verbs per sentence vs score'),
)
for panel_no, (tag, easy_colour, hard_colour, panel_title) in enumerate(per_sentence_panels, start=1):
    plt.subplot(2, 3, panel_no)
    # x: tag count per sentence, y: readability score.
    plt.scatter(easy_dictionary_per_sentence[tag], easy_score, color=easy_colour)
    plt.scatter(hard_dictionary_per_sentence[tag], hard_score, color=hard_colour)
    plt.title(panel_title)

# Stretch the subplot area beyond the default figure bounds.
plt.subplots_adjust(left=0.1, bottom=0.1, right=2, top=2.5,
                    wspace=0.4, hspace=0.4)

In [None]:
# (POS tag, easy colour, hard colour, panel title) for the 2x3 grid.
# Every panel, including AUX, reads the raw per-excerpt counts; the AUX panel
# previously plotted and titled the per-sentence values by mistake.
per_data_point_panels = (
    ('NOUN', "blue", "lightblue", 'Nouns per data point vs score'),
    ('PROPN', "green", "lightgreen", 'Proper nouns per data point vs score'),
    ('VERB', "red", "pink", 'Verbs per data point vs score'),
    ('ADJ', "orange", "yellow", 'Adjectives per data point vs score'),
    ('ADP', "darkgrey", "lightgrey", 'ADP per data point vs score'),
    ('AUX', "brown", "peru", 'Auxillary verbs per data point vs score'),
)
for panel_no, (tag, easy_colour, hard_colour, panel_title) in enumerate(per_data_point_panels, start=1):
    plt.subplot(2, 3, panel_no)
    # x: tag count in the excerpt, y: readability score.
    plt.scatter(easy_dictionary[tag], easy_score, color=easy_colour)
    plt.scatter(hard_dictionary[tag], hard_score, color=hard_colour)
    plt.title(panel_title)

# Stretch the subplot area beyond the default figure bounds.
plt.subplots_adjust(left=0.1, bottom=0.1, right=2, top=2.5,
                    wspace=0.4, hspace=0.4)

# **Additional readability scores**
We compute additional readability scores using the [textstat](https://github.com/shivam5992/textstat) library. The scores used are:
* Flesch reading ease
* Flesch kincaid grade
* Gunning Fog
* Smog Index
* Automated Readability Index
* Coleman liau index
* Linsear write formula
* Dale chall readability score

We then calculate the correlation between the given score and the above calculated scores.

In [None]:
# Easy excerpts first, then hard ones; `scores` follows the same order.
total_text = list(easy_text.values) + list(hard_text.values)
scores = list(easy_score.values) + list(hard_score.values)

# Each name is both the textstat function to call and the key it is stored under.
metric_names = (
    'flesch_reading_ease',
    'flesch_kincaid_grade',
    'gunning_fog',
    'smog_index',
    'automated_readability_index',
    'coleman_liau_index',
    'linsear_write_formula',
    'dale_chall_readability_score',
)
scores_dict = {metric: [] for metric in metric_names}
scores_dict['given_scores'] = scores

# One value per excerpt for every metric.
for text in total_text:
    for metric in metric_names:
        scores_dict[metric].append(getattr(textstat, metric)(text))

# Not used here: textstat.text_standard(text, float_output=False), which
# combines all of the above into a single estimate.

In [None]:
# One column per readability metric, plus the given scores.
scores_dataframe = pd.DataFrame(data=scores_dict)

# Pairwise correlations between all columns, drawn as an annotated heatmap.
corrMatrix = scores_dataframe.corr()
sns.heatmap(data=corrMatrix, annot=True)
plt.show()

From the above correlation heatmap we can see that the correlation for given score is negative for all the additional scores except for flesch reading ease.

### *That's all for now. Please upvote it if you liked the notebook :)*