# Data Extraction 
<ol>
    <li> <a href='#introduction'>Introduction</a></li>
    <li> <a href='#data_extraction'> Data Extraction </a></li>
    <li> <a href='#data_visualisation'> Data Visualisation </a></li>
    <li> <a href='#wordclouds'> Word Clouds </a></li>
    <li> <a href='#conclusion'> Conclusion </a></li>
</ol>

<h1 id="introduction">Introduction </h1>
Data preparation is a basic step in building any machine learning model.  

In [None]:
!pip install ../input/package/syllables-0.1.0-py2.py3-none-any.whl

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re # Regularexpresion
import syllables
import seaborn as sns
import math
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Default seaborn colour palette, stored under the name `color`.
color = sns.color_palette()
# Use seaborn's 'darkgrid' style for the figures drawn in this notebook.
sns.set_style('darkgrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# De-duplicated English stop words from NLTK, followed by the punctuation
# tokens that should be dropped as well.
all_stopword = [*set(stopwords.words('english')), '?', '.', ',', '!', "''", "``"]

In [None]:
# Load the competition CSV files: `df` holds train.csv, `test` holds test.csv.
df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')

In [None]:
# Peek at the first two training rows.
df.head(n=2)

In [None]:
# Show the raw text column.
df['excerpt']

In [None]:
# Report the range of the target column (the second label used to say "min"
# although it prints the maximum).
print("train target min value ",df['target'].min())
print("train target max value ",df['target'].max())

<h1 id="data_extraction"> Data Extraction </h1>

In [None]:
def only_words(wordlist):
    '''
    Return the distinct tokens of ``wordlist`` that are not stop words.

    Tokens found in the module-level ``all_stopword`` list are discarded and
    duplicates are collapsed through a set, so the order of the returned
    list is arbitrary.
    '''
    kept_tokens = {token for token in wordlist if token not in all_stopword}
    return list(kept_tokens)

def count_words(wordlist):
    '''Return the number of items in ``wordlist``.'''
    word_total = len(wordlist)
    return word_total

def count_sentence(paragraph):
    '''Return the number of items (sentences) in ``paragraph``.'''
    sentence_total = len(paragraph)
    return sentence_total

def count_syllables(wordlist):
    '''
    Return the sum of ``syllables.estimate`` over every item of ``wordlist``.
    '''
    syllable_total = 0
    for token in wordlist:
        syllable_total += syllables.estimate(token)
    return syllable_total

def average_word_length(wordlist):
    '''
    Return the mean character length of the items in ``wordlist``.

    Returns 0.0 for an empty ``wordlist`` instead of raising
    ZeroDivisionError, so rows without any remaining words do not abort
    the feature extraction.
    '''
    if len(wordlist) == 0:
        return 0.0
    return sum(len(word) for word in wordlist) / len(wordlist)

# Flesch-Kincaid reading formula
![](https://i.ibb.co/xsYLbCk/formula1.png)

In [None]:
def flesch_reading_level(total_words, total_sentence, total_syllables):
    '''
    Return 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59.
    '''
    words_per_sentence = total_words / total_sentence
    syllables_per_word = total_syllables / total_words
    return 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

# Flesch-Dayani Score



![](https://i.ibb.co/s6FjDF5/formula2.png)

In [None]:
def flesch_dayani_score(total_words, total_sentence, total_syllables):
    '''
    Return 0.31 - 8.846 * (syllables / words) - 1.01 * (words / sentences).
    '''
    syllables_per_word = total_syllables / total_words
    words_per_sentence = total_words / total_sentence
    return 0.31 - 8.846 * syllables_per_word - 1.01 * words_per_sentence

In [None]:
# Split every excerpt into word-level tokens (train and test).
train_word_token = [word_tokenize(excerpt) for excerpt in df['excerpt']]
test_word_token = [word_tokenize(excerpt) for excerpt in test['excerpt']]

In [None]:
# Keep only the distinct non-stop-word tokens of each excerpt.
df['only_words'] = [only_words(tokens) for tokens in train_word_token]
test['only_words'] = [only_words(tokens) for tokens in test_word_token]

In [None]:
# Split every excerpt into sentences.
df['sentence'] = [sent_tokenize(excerpt) for excerpt in df['excerpt']]
test['sentence'] = [sent_tokenize(excerpt) for excerpt in test['excerpt']]

In [None]:
# Number of entries in each row's `only_words` list.
df['total_words'] = [count_words(words) for words in df['only_words']]
test['total_words'] = [count_words(words) for words in test['only_words']]

In [None]:
# Number of sentences found in each excerpt.
df['total_sentence'] = [count_sentence(sentences) for sentences in df['sentence']]
test['total_sentence'] = [count_sentence(sentences) for sentences in test['sentence']]

In [None]:
# Estimated syllable total over each row's `only_words` list.
df['total_syllables'] = [count_syllables(words) for words in df['only_words']]
test['total_syllables'] = [count_syllables(words) for words in test['only_words']]

In [None]:
# Mean character length over each row's `only_words` list.
df['average_word_length'] = [average_word_length(words) for words in df['only_words']]
test['average_word_length'] = [average_word_length(words) for words in test['only_words']]

In [None]:
# Flesch-Kincaid reading level per row, from the three count columns.
df['FRL'] = [
    flesch_reading_level(words, sentences, syllable_count)
    for words, sentences, syllable_count
    in zip(df['total_words'], df['total_sentence'], df['total_syllables'])
]
test['FRL'] = [
    flesch_reading_level(words, sentences, syllable_count)
    for words, sentences, syllable_count
    in zip(test['total_words'], test['total_sentence'], test['total_syllables'])
]

In [None]:
# Flesch-Dayani score per row, from the three count columns.
df['FDS'] = [
    flesch_dayani_score(words, sentences, syllable_count)
    for words, sentences, syllable_count
    in zip(df['total_words'], df['total_sentence'], df['total_syllables'])
]
test['FDS'] = [
    flesch_dayani_score(words, sentences, syllable_count)
    for words, sentences, syllable_count
    in zip(test['total_words'], test['total_sentence'], test['total_syllables'])
]

In [None]:
def count_verb(wordlist):
    '''
    Return the character share of tokens tagged ``VB`` in ``wordlist``.

    The tokens are tagged with ``nltk.pos_tag``; the result is the summed
    length of the ``VB`` tokens divided by (summed length of all tokens * 100).
    Returns 0.0 when ``wordlist`` is empty or contains no characters, where
    the division would otherwise raise ZeroDivisionError.
    '''
    if len(wordlist) == 0:
        return 0.0
    verb_chars = 0
    total_chars = 0
    for token, pos in nltk.pos_tag(wordlist):
        if pos == 'VB':
            verb_chars += len(token)
        total_chars += len(token)
    if total_chars == 0:
        return 0.0
    # NOTE(review): dividing by (total * 100) shrinks the ratio by a factor of
    # 100; if a percentage was intended this should be
    # verb_chars / total_chars * 100. Kept unchanged so existing feature
    # values stay the same - confirm intent.
    return verb_chars / (total_chars * 100)

# `count_verb` feature for every row of train and test.
df['verbs'] = [count_verb(words) for words in df['only_words']]
test['verbs'] = [count_verb(words) for words in test['only_words']]

In [None]:
def verb_past(wordlist):
    '''
    Return the character share of tokens tagged ``VBD`` in ``wordlist``.

    The tokens are tagged with ``nltk.pos_tag``; the result is the summed
    length of the ``VBD`` tokens divided by (summed length of all tokens * 100).
    Returns 0.0 when ``wordlist`` is empty or contains no characters, where
    the division would otherwise raise ZeroDivisionError.
    '''
    if len(wordlist) == 0:
        return 0.0
    past_chars = 0
    total_chars = 0
    for token, pos in nltk.pos_tag(wordlist):
        if pos == 'VBD':
            past_chars += len(token)
        total_chars += len(token)
    if total_chars == 0:
        return 0.0
    # NOTE(review): the (total * 100) divisor looks like a misplaced percent
    # conversion; kept unchanged so feature values stay the same - confirm.
    return past_chars / (total_chars * 100)

# `verb_past` feature for every row of train and test.
df['verbs_past'] = [verb_past(words) for words in df['only_words']]
test['verbs_past'] = [verb_past(words) for words in test['only_words']]

In [None]:
def count_adverb(wordlist):
    '''
    Return the character share of tokens tagged ``RB`` in ``wordlist``.

    The tokens are tagged with ``nltk.pos_tag``; the result is the summed
    length of the ``RB`` tokens divided by (summed length of all tokens * 100).
    Returns 0.0 when ``wordlist`` is empty or contains no characters, where
    the division would otherwise raise ZeroDivisionError.
    '''
    if len(wordlist) == 0:
        return 0.0
    adverb_chars = 0
    total_chars = 0
    for token, pos in nltk.pos_tag(wordlist):
        if pos == 'RB':
            adverb_chars += len(token)
        total_chars += len(token)
    if total_chars == 0:
        return 0.0
    # NOTE(review): the (total * 100) divisor looks like a misplaced percent
    # conversion; kept unchanged so feature values stay the same - confirm.
    return adverb_chars / (total_chars * 100)

# `count_adverb` feature for every row of train and test.
df['adverb'] = [count_adverb(words) for words in df['only_words']]
test['adverb'] = [count_adverb(words) for words in test['only_words']]

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().T

In [None]:
# Only the mean and standard deviation of each numeric column.
df.describe().T[['mean', 'std']]

<h1 id="data_visualisation"> Data Visualisation </h1>

In [None]:
# Scatter each basic text statistic against the readability target.
# Each entry is (column name, panel title, point colour); the panels are
# filled row by row.
basic_panels = [
    ('total_words', "Total Words", '#e6005c80'),
    ('total_sentence', "Total Sentence", '#00666680'),
    ('total_syllables', "Total Syllables", '#66990080'),
    ('average_word_length', "Average Word Length", '#80008080'),
]
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
for ax, (column, title, point_color) in zip(axs.flat, basic_panels):
    ax.scatter(df['target'], df[column], color=point_color)
    ax.set_title(title)
    ax.set_xlabel('target')
    ax.set_ylabel(column)
# plt.show() rather than fig.show(): Figure.show() needs a GUI backend and
# may only emit a warning when figures are rendered inline.
plt.show()



In [None]:
# Scatter the derived features against the readability target.
# Each entry is (column name, panel title, point colour); the panels are
# filled row by row, in the same positions as before.
feature_panels = [
    ('FRL', "Flesch-Kincaid reading", '#1ab2ff80'),
    ('standard_error', "Standard Error", '#80008080'),
    ('verbs_past', "Verb Past", '#80008080'),
    ('FDS', "Flesch-Dayani Score", '#00666680'),
    ('verbs', "verb", '#00666680'),
    ('adverb', "adverb", '#00666680'),
]
# Six panels need a 2x3 grid; the previous 3x3 grid left an empty bottom row.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(12, 8))
for ax, (column, title, point_color) in zip(axs.flat, feature_panels):
    ax.scatter(df['target'], df[column], color=point_color)
    ax.set_title(title)
    ax.set_xlabel('target')
    ax.set_ylabel(column)
# plt.show() rather than fig.show(): Figure.show() needs a GUI backend and
# may only emit a warning when figures are rendered inline.
plt.show()

In [None]:
# Pairwise relationships between selected features and the target, with
# kernel-density plots on the diagonal.
pairplot_frame = df[['FRL', 'verbs_past', 'FDS', 'verbs','target']]
sns.pairplot(pairplot_frame, diag_kind='kde')
plt.show()

<h1 id="wordclouds"> Word Clouds </h1>

In [None]:
#start with one review:
# Take the first training excerpt as the sample text.
text = df.loc[0, 'excerpt']

# Build a word cloud with the default settings and feed it the text.
wordcloud = WordCloud()
wordcloud.generate(text)

# Render the resulting image without axes.
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Same excerpt, with a white background and capped font size / word count.
wordcloud = WordCloud(background_color='white', max_words=100, max_font_size=50)
wordcloud.generate(text)

plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Save the most recently generated word cloud as a PNG in the working directory.
wordcloud.to_file("first_review.png")

<h1 id="conclusion">Conclusion </h1>
Any suggestions are very welcome. I will keep working on this notebook while also looking for a new job,
as I wish to change my current one.