In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from collections import Counter
import string
from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS

**Import Dataset**

In [None]:
# Read the train, test and sample-submission CSVs, in that order.
train_df,test_df,sample_sub=(
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

In [None]:
# Report the frame's shape, then display its first ten rows.
train_shape=train_df.shape
print('Train data Shape: ',train_shape)
train_df.head(10)

In [None]:
# Narrow the training frame to the excerpt text and its target value.
imp_cols=['excerpt','target']
train_df=train_df.loc[:,imp_cols]
train_df.head()

**Target Distribution Plot**

In [None]:
# Distribution plot of the target column.
target_values=train_df['target']
sns.displot(target_values)

In [None]:
# Summary statistics of the target column.
target_summary=train_df['target'].describe()
target_summary

**Excerpt length distribution (words per excerpt)**

In [None]:
#Count the whitespace-separated words in every raw excerpt and plot the distribution
sent_len=[]
for excerpt in train_df['excerpt']:
    sent_len.append(len(excerpt.split()))
sns.displot(sent_len)

**Clean sentences**
* Remove punctuation
* Remove stop words
* Remove numbers and words containing numbers

In [None]:
#Clean every excerpt so word counts and frequencies reflect content words only.
sp=stopwords.words('english')
#A set gives constant-time membership tests; `sp` itself stays a list for any code that reads it.
stop_set=set(sp)
#Build the punctuation-deletion table once instead of once per excerpt.
punct_table=str.maketrans('','',string.punctuation)

def clean_excerpt(text,stop_words,table):
    """Return `text` lower-cased, without stop words, punctuation or non-alphabetic tokens.

    Parameters
    ----------
    text : str
        Raw excerpt.
    stop_words : collection of str
        Lower-case stop words; may contain contractions such as "don't".
    table : dict
        Translation table (from `str.maketrans`) that deletes punctuation.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    kept=[]
    for token in text.lower().split():
        #Test against the stop words BEFORE deleting punctuation: once the
        #apostrophe is gone "don't" becomes "dont" and would no longer match.
        #Only edge punctuation is stripped here so "don't," still matches.
        if token.strip(string.punctuation) in stop_words:
            continue
        word=token.translate(table)
        #isalpha() rejects numbers, words containing digits, and tokens that
        #were punctuation only (now empty strings).
        if word.isalpha() and word not in stop_words:
            kept.append(word)
    return ' '.join(kept)

cleaned_text=[clean_excerpt(txt,stop_set,punct_table) for txt in train_df['excerpt']]


**Length distribution of sentences after cleaning**

In [None]:
#Word count of every excerpt after cleaning, plotted as a distribution
sent_len=[len(tokens) for tokens in map(str.split,cleaned_text)]
sns.displot(sent_len)

**Get frequency of each word in corpus**


In [None]:
#Count how often each word occurs across all cleaned excerpts
corpus=' '.join(cleaned_text)
word_freq=Counter(corpus.split())

#A word is treated as rare when it occurs at most this many times (inclusive)
RARE_WORD_MAX_COUNT=10
rare_words=[word for word,count in word_freq.items() if count<=RARE_WORD_MAX_COUNT]

#NOTE: despite its name, `frequent_words` holds the RARE words as one
#space-separated string; the name is kept because the word-cloud cell reads it.
frequent_words=' '.join(rare_words)

**Word cloud of the rarest words (frequency ≤ 10)**

In [None]:
# Configure an 800x800 cloud on a white background, then fit it to the word string.
cloud_options = dict(
    width = 800,
    height = 800,
    background_color ='white',
    min_font_size = 10,
)
wordcloud = WordCloud(**cloud_options)
wordcloud.generate(frequent_words)

# Render the cloud as an image with no axes and no padding.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()