In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import regex as re
from wordcloud import WordCloud
import spacy
import collections
# Load the 'en_core_web_sm' spaCy pipeline with its 'parser' and 'ner' components disabled.
nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])
import nltk
from nltk.util import ngrams    

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training CSV from the competition's input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../input/commonlitreadabilityprize/train.csv",
)

In [None]:
# Display the column labels of the loaded DataFrame.
df.columns

In [None]:
# Display the excerpt stored under row label 0 (single .loc lookup instead of
# chained indexing).
df.loc[0, 'excerpt']

In [None]:
# Histogram of the 'target' column.
df['target'].plot(kind='hist')

In [None]:
# Histogram of the 'standard_error' column.
df['standard_error'].plot(kind='hist')

In [None]:
# Scatter each row's 'standard_error' against its 'target'. The axes are
# labelled and titled so the figure can be read without looking at the code,
# and plt.show() keeps the artist repr out of the cell output.
fig, ax = plt.subplots()
ax.scatter(df['target'], df['standard_error'])
ax.set_xlabel('target')
ax.set_ylabel('standard_error')
ax.set_title('Standard error vs. target')
plt.show()

The standard error tends to be higher where the target value is extreme: either very high (too tough to read) or very low (too easy to read).

In [None]:
# Show every row whose 'target' value is exactly 0.
df.loc[df['target'].eq(0)]

## Excerpt data preprocessing

In [None]:
# Cleaning patterns for the excerpt text. Every pattern is a raw string so that
# `\s`, `\b` and `\d` reach the regex engine untouched instead of being parsed
# as (invalid) Python string escapes.

# URLs: "http://" / "https://", "www" plus up to three digits and a dot, or a
# bare domain followed by a slash; balanced parentheses inside the URL are allowed.
regex_http = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
# Any character that is not an ASCII letter, a digit or whitespace.
regex_alpha = re.compile(r'[^0-9A-Za-z\s]')
# Tokens that start or end with digits, together with the letters attached to
# them (e.g. "1st", "b52"). Matching is case-insensitive because the patterns
# are applied to the excerpt before it is lowercased; without the flag "3RD"
# would lose only its digit and leave "RD" behind as a stray token.
regex_numbers = re.compile(r'(?i)\b\d+[a-z]*|[a-z]*\d+\b')
# Runs of one or more whitespace characters.
regex_space = re.compile(r'\s+')
# Pattern -> replacement map, in the order the substitutions are meant to run:
# URLs first (while their punctuation is still intact), then non-alphanumerics,
# then number tokens, then whitespace collapsing, and finally non-breaking spaces.
d = {regex_http:' ', regex_alpha:' ', regex_numbers: ' ', regex_space: ' ' , '\xa0' : ' '}

In [None]:
# Apply the regex replacement map to every excerpt, then lowercase the result.
df['clean_sentence'] = (
    df['excerpt']
    .replace(d, regex=True)
    .apply(lambda sentence: sentence.lower())
)

In [None]:
# Lemmatization with stopwords removal using spacy
# Lemmatize every cleaned excerpt with spaCy, dropping punctuation and stop
# words, and join the remaining lemmas back into one space-separated string.
# nlp.pipe streams the texts through the pipeline in batches, which avoids the
# overhead of calling nlp() once per row inside Series.apply.
df['lemmatized'] = [
    ' '.join(
        token.lemma_
        for token in doc
        if not token.is_punct and not token.is_stop
    )
    for doc in nlp.pipe(df['clean_sentence'])
]

## Frequent words in the training excerpts

In [None]:
# Split each lemmatized excerpt on whitespace and collect all words in order.
words = [word for excerpt in df['lemmatized'] for word in excerpt.split()]
# Tally occurrences of each word.
count = collections.Counter(words)
# Show the 15 most frequent words with their counts.
count.most_common(15)

In [None]:
# Tabulate the 30 most frequent words as (Word, Frequency) rows.
word_df = pd.DataFrame(
    data=count.most_common(30),
    columns=['Word', 'Frequency'],
)

In [None]:
# Horizontal bar chart of the most frequent words, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 12))

# Sort ascending by frequency before plotting the bars.
word_df.sort_values(by='Frequency').plot(kind='barh', x='Word', y='Frequency', ax=ax)

ax.set_title("Common Words Found in Excerpts after cleaning")

plt.show()

In [None]:
# Build a word cloud from all lemmatized excerpts joined into one string.
wordcloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color='black',
    colormap='Set2',
    collocations=False,
).generate(" ".join(df['lemmatized']))

# Write the rendered cloud to disk.
wordcloud.to_file("wordcloud.png")

# Display the cloud without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

## Top Bigrams and Trigrams 

In [None]:
# Full corpus as a single string (one space between excerpts).
text = " ".join(df['lemmatized'])
# Tokenize each lemmatized excerpt on its own so that n-grams can be built per
# excerpt; building them from one concatenated token stream would pair the last
# words of one excerpt with the first words of the next.
tokens_per_excerpt = [nltk.word_tokenize(excerpt) for excerpt in df['lemmatized']]
# Flat token list over the whole corpus, in excerpt order.
tokens = [token for excerpt_tokens in tokens_per_excerpt for token in excerpt_tokens]
# Materialize the n-grams as lists: nltk.bigrams / nltk.trigrams return
# one-shot generators, which would be empty the second time they are counted.
bigram_words = [
    bigram
    for excerpt_tokens in tokens_per_excerpt
    for bigram in nltk.bigrams(excerpt_tokens)
]
trigram_words = [
    trigram
    for excerpt_tokens in tokens_per_excerpt
    for trigram in nltk.trigrams(excerpt_tokens)
]

In [None]:
# Count how often each bigram and each trigram occurs.
bi_fdist = nltk.FreqDist(bigram_words)
tri_fdist = nltk.FreqDist(trigram_words)

In [None]:
# Show the 15 most frequent bigrams with their counts.
bi_fdist.most_common(15)

In [None]:
# Show the 15 most frequent trigrams with their counts.
tri_fdist.most_common(15)