In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training data; each row carries an `excerpt` text and a `target` value.
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
# Later cells read the text through `df.title`. Create a real column with item
# assignment: `df.title = ...` only sets a Python attribute on the DataFrame
# object, which is not part of the data and is lost by any copy or slice.
df['title'] = df['excerpt']

In [None]:
# Smallest value in the `target` column (pandas skips missing values here,
# whereas the built-in min() over a list gives order-dependent results with NaN).
df.target.min()

# Preliminary Analysis

## Number of characters in the excerpts

In [None]:
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt

# Character length of every excerpt, computed once and reused for the
# histogram and the summary statistics.
excerpt_lengths = df.excerpt.str.len()

fig = sns.displot(x=excerpt_lengths, data=df, color='black', kde=False, height=6, aspect=3, kind='hist')

for summary in (excerpt_lengths.min(), excerpt_lengths.max(), excerpt_lengths.mean()):
    print(summary)

We see that excerpt lengths range from 669 to 1341 characters. On average, an excerpt is 972 characters long.

## Number of words in the excerpts

In [None]:
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt

# Number of whitespace-separated tokens in each text.
word_counts = df.title.str.split().map(len)

fig = sns.displot(x=word_counts, color='blue', kde=False, height=6, aspect=3, kind='hist')

for summary in (word_counts.min(), word_counts.max(), word_counts.mean()):
    print(summary)

We see that the number of words per excerpt ranges from 135 to 205. On average, an excerpt has 172 words.

# Most occurring words

In [None]:
import nltk
from collections import defaultdict

nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words; later cells reuse this set for filtering.
stop = set(stopwords.words('english'))

# Split every text on whitespace and flatten the token lists into one corpus.
title = df.title.str.split().values.tolist()
corpus = []
for tokens in title:
    corpus.extend(tokens)

# Occurrences of each stop word in the corpus (exact, case-sensitive match).
dic = defaultdict(int)
for word in (token for token in corpus if token in stop):
    dic[word] += 1

In [None]:
from collections import Counter
from nltk.stem import PorterStemmer

sns.set(rc={'figure.figsize':(15,15)})

ps = PorterStemmer()
counter = Counter(corpus)
most = counter.most_common()

# From the 120 most frequent tokens keep the alphabetic non-stop-words, and
# only the first (most frequent) surface form seen for each stem.
x, y = [], []
lookup = set()  # stems already plotted; a set keeps the membership test cheap
for word, count in most[:120]:
    lowered = word.lower()
    if lowered in stop or not word.isalpha():
        continue
    stemmed = ps.stem(lowered)  # stem once and reuse for the test and the record
    if stemmed in lookup:
        continue
    x.append(word)
    y.append(count)
    lookup.add(stemmed)

sns.barplot(x=y,y=x)

# N-Gram Exploration

## Most common bigrams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_ngram(corpus, n=None, top_k=100):
    """Return the most frequent n-grams of `corpus`, skipping pure stop-word n-grams.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents; they are handed to CountVectorizer as-is.
    n : int, optional
        Size of the n-grams to count. ``None`` (the default) counts unigrams;
        the default used to be forwarded as ``ngram_range=(None, None)``,
        which is not a valid range.
    top_k : int or None, optional
        Maximum number of n-grams returned. Defaults to 100, the limit that
        was previously hard-coded; ``None`` returns every n-gram.

    Returns
    -------
    list of (ngram, count) tuples
        Sorted by descending count.

    Notes
    -----
    Reads the notebook-level ``stop`` set to decide which tokens are stop words.
    """
    if n is None:
        n = 1
    vec = CountVectorizer(ngram_range=(n, n)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Column sums give the corpus-wide count of each vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [
        (ngram, sum_words[0, idx])
        for ngram, idx in vec.vocabulary_.items()
        # Keep an n-gram only if at least one of its tokens is not a stop word.
        if not all(token in stop for token in ngram.split())
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]

In [None]:
# 20 most frequent bigrams, split into parallel lists:
# x holds the bigram text, y the matching count.
top_n_bigrams = get_top_ngram(df.title, 2)[:20]
x, y = (list(column) for column in zip(*top_n_bigrams))

sns.barplot(x=y, y=x, palette='hls')

## Most common trigrams

In [None]:
# 20 most frequent trigrams, split into parallel lists:
# x holds the trigram text, y the matching count.
top_n_trigrams = get_top_ngram(df.title, 3)[:20]
x, y = (list(column) for column in zip(*top_n_trigrams))

sns.barplot(x=y, y=x, palette='coolwarm')

# Topic Modelling

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

def preprocess_news(df):
    """Tokenise and lemmatise every text in ``df.title``.

    For each text, tokens are kept when they are purely alphabetic, longer
    than two characters and (lower-cased) not in the notebook-level ``stop``
    set; the survivors are lemmatised with WordNet.

    Returns a list with one list of lemmas per text, in row order.
    """
    lem = WordNetLemmatizer()
    corpus = []
    for news in df.title:
        words = [
            lem.lemmatize(w)
            for w in word_tokenize(news)
            if w.isalpha() and len(w) > 2 and w.lower() not in stop
        ]
        corpus.append(words)
    return corpus

corpus = preprocess_news(df)

In [None]:
import gensim

# Map every lemma to an integer id, then encode each document as a
# bag of words through that dictionary.
dic = gensim.corpora.Dictionary(corpus)
bow_corpus = list(map(dic.doc2bow, corpus))

In [None]:
# Fit a 5-topic LDA model on the bag-of-words corpus. The random state is
# seeded so that topic initialisation does not change from run to run.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                   num_topics = 5,
                                   id2word = dic,
                                   passes = 10,
                                   workers = 2,
                                   random_state = 42)
lda_model.show_topics()

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

# Prepare the pyLDAvis data from the fitted LDA model, its bag-of-words
# corpus and its dictionary, then render the visualisation as the cell output.
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dic)
pyLDAvis.display(LDAvis_prepared)

# Wordcloud Analysis

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Stored under its own name so that it does not shadow the
# `nltk.corpus.stopwords` object imported earlier in the notebook.
wordcloud_stopwords = set(STOPWORDS)

def _wordcloud_text(data):
    """Flatten `data` (str, iterable of str, or iterable of token lists) into one string."""
    if isinstance(data, str):
        return data
    try:
        documents = iter(data)
    except TypeError:
        # Not iterable: fall back to the plain string form.
        return str(data)
    return ' '.join(
        doc if isinstance(doc, str) else ' '.join(map(str, doc))
        for doc in documents
    )

def show_wordcloud(data):
    """Draw a word cloud for `data`.

    `data` may be a single string, an iterable of strings, or an iterable of
    token lists such as the output of `preprocess_news`. Tokens are joined
    with spaces before generation; calling `str()` on a nested list would
    instead pass the list's repr (brackets, quotes and commas) to WordCloud.
    """
    wordcloud = WordCloud(
        background_color=None,
        stopwords=wordcloud_stopwords,
        max_words=1000,
        max_font_size=30,
        scale=4,
        random_state=42,
        mode='RGBA',
        colormap='plasma')

    wordcloud = wordcloud.generate(_wordcloud_text(data))

    plt.figure(1, figsize=(15, 15))
    plt.axis('off')

    plt.imshow(wordcloud)
    plt.show()

show_wordcloud(corpus)

# Sentiment Analysis

## We'll first look at the polarity of the excerpts.

In [None]:
from textblob import TextBlob

sns.set(rc={'figure.figsize':(6, 6)})

def polarity(text):
    """Return the TextBlob sentiment polarity score of `text`."""
    return TextBlob(text).sentiment.polarity

# Item assignment creates a real column; `df.polarity_score = ...` would only
# set a Python attribute on the DataFrame object.
df['polarity_score'] = df.title.apply(polarity)
df['polarity_score'].hist(color='skyblue')

In [None]:
def sentiment(x):
    """Map a polarity score to a label: 'neg' below zero, 'neu' at zero, otherwise 'pos'."""
    if x == 0:
        return 'neu'
    return 'neg' if x < 0 else 'pos'

sns.set(rc={'figure.figsize':(6, 6)})
# Item assignment creates a real column; `df.sentiment = ...` would only set a
# Python attribute on the DataFrame object.
df['sentiment'] = df.polarity_score.map(sentiment)

# Count each label once and reuse the result for both plot axes.
sentiment_counts = df['sentiment'].value_counts()
sns.barplot(x=sentiment_counts.index, y=sentiment_counts, palette='coolwarm')

# NER Analysis

In [None]:
# Download spaCy's small English pipeline, which the next cell loads by name.
! python -m spacy download en_core_web_sm

In [None]:
import spacy

# Load the small English spaCy pipeline; the NER cells below call `nlp(text)`.
nlp = spacy.load("en_core_web_sm")

In [None]:
def ner(text):
    """Return the label of every named entity spaCy finds in `text`."""
    labels = []
    for entity in nlp(text).ents:
        labels.append(entity.label_)
    return labels

# Collect the labels of all texts into one flat list and tally them.
ent = df.title.apply(ner)
ent = [label for labels in ent for label in labels]
counter = Counter(ent)
count = counter.most_common()

In [None]:
# Entity labels (x) against how often each occurs (y), most frequent first.
sns.set(rc={'figure.figsize':(15, 15)})
x, y = (list(column) for column in zip(*count))
sns.barplot(x=y, y=x, palette='husl')

In [None]:
! pip install pandarallel

In [None]:
from pandarallel import pandarallel

# Initialise pandarallel; the cells below call `parallel_apply` on `df.title`.
pandarallel.initialize()

In [None]:
def ner(text, ent="PERSON"):
    """Return the text of every entity in `text` whose spaCy label equals `ent`."""
    return [entity.text for entity in nlp(text).ents if entity.label_ == ent]

# Tally PERSON mentions (the default `ent`) across all texts.
entities = df.title.parallel_apply(ner)
entities = [mention for mentions in entities for mention in mentions]
counter = Counter(entities)

x, y = map(list, zip(*counter.most_common(20)))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so `sns.barplot(y, x, ...)` no longer works there.
sns.barplot(x=y, y=x, palette='coolwarm')

In [None]:
def ner(text, ent="CARDINAL"):
    """Return the text of every entity in `text` whose spaCy label equals `ent`."""
    return [entity.text for entity in nlp(text).ents if entity.label_ == ent]

# Tally CARDINAL mentions (the default `ent`) across all texts.
entities = df.title.parallel_apply(ner)
entities = [mention for mentions in entities for mention in mentions]
counter = Counter(entities)

x, y = map(list, zip(*counter.most_common(20)))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so `sns.barplot(y, x, ...)` no longer works there.
sns.barplot(x=y, y=x, palette='coolwarm')

In [None]:
def ner(text, ent="DATE"):
    """Return the text of every entity in `text` whose spaCy label equals `ent`."""
    return [entity.text for entity in nlp(text).ents if entity.label_ == ent]

# Tally DATE mentions (the default `ent`) across all texts.
entities = df.title.parallel_apply(ner)
entities = [mention for mentions in entities for mention in mentions]
counter = Counter(entities)

x, y = map(list, zip(*counter.most_common(20)))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so `sns.barplot(y, x, ...)` no longer works there.
sns.barplot(x=y, y=x, palette='coolwarm')

In [None]:
def ner(text, ent="GPE"):
    """Return the text of every entity in `text` whose spaCy label equals `ent`."""
    return [entity.text for entity in nlp(text).ents if entity.label_ == ent]

# Tally GPE mentions (the default `ent`) across all texts.
entities = df.title.parallel_apply(ner)
entities = [mention for mentions in entities for mention in mentions]
counter = Counter(entities)

x, y = map(list, zip(*counter.most_common(20)))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so `sns.barplot(y, x, ...)` no longer works there.
sns.barplot(x=y, y=x, palette='coolwarm')

In [None]:
def ner(text, ent="ORG"):
    """Return the text of every entity in `text` whose spaCy label equals `ent`."""
    return [entity.text for entity in nlp(text).ents if entity.label_ == ent]

# Tally ORG mentions (the default `ent`) across all texts.
entities = df.title.parallel_apply(ner)
entities = [mention for mentions in entities for mention in mentions]
counter = Counter(entities)

x, y = map(list, zip(*counter.most_common(20)))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so `sns.barplot(y, x, ...)` no longer works there.
sns.barplot(x=y, y=x, palette='coolwarm')

# POS Tagging
## We'll now do Part-of-Speech Tagging.

**Here's the list of tags**:

Noun (NN)- Joseph, London, table, cat, teacher, pen, city

Verb (VB)- read, speak, run, eat, play, live, walk, have, like, are, is

Adjective(JJ)- beautiful, happy, sad, young, fun, three

Adverb(RB)- slowly, quietly, very, always, never, too, well, tomorrow

Preposition (IN)- at, on, in, from, with, near, between, about, under

Determiner (DT) - one, many

Conjunction (CC)- and, or, but, because, so, yet, unless, since, if

Pronoun(PRP)- I, you, we, they, he, she, it, me, us, them, him, her, this

Interjection (UH)- Ouch! Wow! Great! Help! Oh! Hey! Hi!

In [None]:
def pos(text):
    """Return the part-of-speech tag of every token in `text`, in token order."""
    # Reading the tag out of each (token, tag) pair gives [] for text without
    # tokens, where indexing the transposed pairs would raise IndexError.
    return [tag for _, tag in nltk.pos_tag(word_tokenize(text))]

# Tag every text, flatten the per-text tag lists and tally the tags.
tags = df.title.parallel_apply(pos)
tags = [tag for text_tags in tags for tag in text_tags]
counter = Counter(tags)

x, y = list(map(list,zip(*counter.most_common(6))))
sns.barplot(x=y, y=x, palette='coolwarm')

In [None]:
def get_nouns(text):
    """Return the alphabetic tokens of `text` that nltk.pos_tag labels 'NN'."""
    tagged = nltk.pos_tag(word_tokenize(text))
    return [word for word, tag in tagged if tag == 'NN' and word.isalpha()]

# Gather the matching tokens of all texts and plot the ten most frequent.
words = df.title.parallel_apply(get_nouns)
words = [word for text_words in words for word in text_words]
counter = Counter(words)

x, y = (list(column) for column in zip(*counter.most_common(10)))
sns.barplot(x=y, y=x, palette='magma')

In [None]:
def get_nouns(text):
    """Return the alphabetic tokens of `text` that nltk.pos_tag labels 'JJ'.

    Note: despite the function's name, the 'JJ' tag selects adjectives here.
    """
    tagged = nltk.pos_tag(word_tokenize(text))
    return [word for word, tag in tagged if tag == 'JJ' and word.isalpha()]

# Gather the matching tokens of all texts and plot the ten most frequent.
words = df.title.parallel_apply(get_nouns)
words = [word for text_words in words for word in text_words]
counter = Counter(words)

x, y = (list(column) for column in zip(*counter.most_common(10)))
sns.barplot(x=y, y=x, palette='magma')