In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition training set and expose the publication title under
# the shorter column name `title`, which every later cell relies on.
df = (
    pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
    .assign(title=lambda frame: frame['pub_title'])
)

# **Preliminary Analysis**

**Number of characters present in the publication titles.**

In [None]:
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt

fig = sns.displot(x=df.title.str.len(), data=df, color='black', kde=False, height=6, aspect=3, kind='hist')

print(df.title.str.len().min())
print(df.title.str.len().max())
print(df.title.str.len().mean())

We see that publication title lengths range from 8 to 560 characters. On average, a publication title is 96 characters long.

**Number of words in publication titles**

In [None]:
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt

temp = df.title.str.split().map(lambda x: len(x))

fig = sns.displot(x=temp, color='blue', kde=False, height=6, aspect=3, kind='hist')

print(temp.min())
print(temp.max())
print(temp.mean())

We see the number of words in the publications titles range from 1 to 84. On average, we have 12 words in a publication title.

**Word length in publication titles**

In [None]:
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

temp = df.title.str.split().apply(lambda x: [len(i) for i in x]).map(lambda x: np.mean(x))

fig = sns.displot(x=temp, color='red', kde=False, height=6, aspect=2, kind='hist')

print(temp.min())
print(temp.max())
print(temp.mean())

We see that the average word length per title varies from 2 to 16 characters. Across all titles, the mean word length is about 7 characters.

**Let's see the stopwords in the titles now.**

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the stop-word corpus, then keep the English list as a set so that
# the membership tests in later cells are fast.
nltk.download('stopwords')

stop = set(stopwords.words('english'))

In [None]:
from collections import defaultdict

# Tokenise every title on whitespace and flatten the result into one list of
# words. Original casing is kept because later cells lower-case on demand.
title = df.title.str.split().tolist()
corpus = [word for words in title for word in words]

# Count stop words case-insensitively. The entries of `stop` are compared
# against the lower-cased token (as the later cells already do), so
# capitalised stop words such as "The" or "Of" are counted as well.
dic = defaultdict(int)

for word in corpus:
    lowered = word.lower()
    if lowered in stop:
        dic[lowered] += 1

In [None]:
# Rank stop words from most to least frequent. The ascending sort is reversed
# (instead of sorting in descending order) so that ties keep the same order.
sorted_dic = sorted(dic.items(), key=lambda pair: pair[1])[::-1]

top_ten = sorted_dic[:10]
keys = [word for word, _ in top_ten]
values = [count for _, count in top_ten]

sns.set(rc={'figure.figsize':(10,10)})

fig = sns.barplot(x=keys, y=values, palette='colorblind')

We see the top 10 stopwords used in the titles.

**Most occurring words**

In [None]:
from collections import Counter
from nltk.stem import PorterStemmer

sns.set(rc={'figure.figsize':(15,15)})

ps = PorterStemmer()
counter = Counter(corpus)
most = counter.most_common()

# Walk the 120 most frequent tokens and keep those that are not stop words,
# are purely alphabetic, and whose stem has not been plotted already.
x, y, lookup = [], [], []
for word, count in most[:120]:
    lowered = word.lower()
    if lowered in stop:
        continue
    stemmed = ps.stem(lowered)
    if stemmed in lookup or not word.isalpha():
        continue
    x.append(word)
    y.append(count)
    lookup.append(stemmed)

sns.barplot(x=y,y=x)

We see the most common words. From the words, we can infer that the dataset has primarily 
publications dealing with diseases such as Alzheimer and dementia. 

# **N-Gram Exploration**

**Most common bigrams**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_ngram(corpus, n=None, top_k=100):
    """Return the most frequent n-grams of ``corpus`` with their counts.

    N-grams made up entirely of stop words (per the notebook-level ``stop``
    set) are dropped; n-grams with at least one non-stop-word token are kept.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorise (here: the publication titles).
    n : int, optional
        Size of the n-grams. ``None`` (the default) is treated as 1, because
        ``ngram_range=(None, None)`` is not a valid range.
    top_k : int, default 100
        Maximum number of (n-gram, count) pairs to return.

    Returns
    -------
    list of (str, int)
        Pairs sorted by descending count, at most ``top_k`` long.
    """
    if n is None:
        n = 1

    # fit_transform learns the vocabulary and builds the count matrix in a
    # single pass instead of tokenising the corpus twice.
    vec = CountVectorizer(ngram_range=(n, n))
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)

    words_freq = [
        (ngram, sum_words[0, idx])
        for ngram, idx in vec.vocabulary_.items()
        # Discard only the n-grams in which every token is a stop word.
        if not all(token in stop for token in ngram.split())
    ]
    words_freq.sort(key=lambda item: item[1], reverse=True)
    return words_freq[:top_k]

In [None]:
# Twenty most frequent bigrams, split into labels (x) and counts (y).
top_n_bigrams = get_top_ngram(df.title, n=2)[:20]

x, y = (list(column) for column in zip(*top_n_bigrams))

sns.barplot(x=y, y=x, palette='hls')

Looking at the top 20 bigrams, we can see that the publications focus on Alzheimer's disease and related cognitive impairment in older adults. We also see publications about Covid-19, reflecting research on the current pandemic.

**Most common trigrams**

In [None]:
# Twenty most frequent trigrams, split into labels (x) and counts (y).
top_n_trigrams = get_top_ngram(df.title, n=3)[:20]

x, y = (list(column) for column in zip(*top_n_trigrams))

sns.barplot(x=y, y=x, palette='coolwarm')

**We see the top 20 trigrams appearing in the publication titles. We observe that the publications are focused on Alzheimer's disease and its effects in the USA.**

# Topic Modelling

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

def preprocess_news(df):
    """Tokenise and lemmatise every title in ``df.title``.

    A token is kept when it is purely alphabetic, longer than two characters
    and (lower-cased) not in the notebook-level ``stop`` set. Kept tokens are
    lemmatised with WordNet in their original casing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column of strings.

    Returns
    -------
    list of list of str
        One list of lemmas per title, in the order of ``df.title``.
    """
    lem = WordNetLemmatizer()
    corpus = []
    for news in df.title:
        words = [
            lem.lemmatize(w)
            for w in word_tokenize(news)
            if w.lower() not in stop and w.isalpha() and len(w) > 2
        ]
        corpus.append(words)
    return corpus

corpus = preprocess_news(df)

In [None]:
import gensim

# Map each distinct token to an integer id, then encode every title as a
# bag of words using that dictionary.
dic = gensim.corpora.Dictionary(corpus)
bow_corpus = list(map(dic.doc2bow, corpus))

In [None]:
# Fit a 5-topic LDA model on the bag-of-words corpus. The seed is fixed so
# that topic numbering stays stable between runs and matches the commentary
# below. NOTE(review): with several workers the result may still vary
# slightly between runs — confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                   num_topics = 5,
                                   id2word = dic,
                                   passes = 10,
                                   workers = 2,
                                   random_state = 42)
lda_model.show_topics()

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

# Build the visualisation data from the fitted LDA model, the bag-of-words
# corpus and the token dictionary, then display it in the notebook.
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dic)
pyLDAvis.display(LDAvis_prepared)

We see that Alzheimer's is the most frequently occurring topic in the publications. However, a new theme also appears: publications focused on agriculture, in topic 4.

# Wordcloud Analysis

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Kept under its own name so it does not shadow the `stopwords` corpus object
# imported from nltk.corpus earlier in the notebook.
wordcloud_stopwords = set(STOPWORDS)

def show_wordcloud(data):
    """Render a word cloud for ``data`` with matplotlib.

    Parameters
    ----------
    data : str or iterable
        Either a ready-made text, or an iterable whose items are strings or
        iterables of tokens (e.g. the list of token lists in ``corpus``).
        Tokens are joined with spaces, so the cloud is built from the words
        themselves rather than from the Python repr of the container
        (brackets, commas and quote characters).
    """
    if isinstance(data, str):
        text = data
    else:
        text = ' '.join(
            doc if isinstance(doc, str) else ' '.join(doc)
            for doc in data
        )

    cloud = WordCloud(
        background_color=None,
        stopwords=wordcloud_stopwords,
        max_words=1000,
        max_font_size=30,
        scale=4,
        random_state=42,
        mode='RGBA',
        colormap='plasma').generate(text)

    plt.figure(1, figsize=(15, 15))
    plt.axis('off')

    plt.imshow(cloud)
    plt.show()

show_wordcloud(corpus)

We see Alzheimer as the focus of most publications, followed by dementia and aging.

# Sentiment Analysis

**We'll first see the polarity of the publication titles.**

In [None]:
from textblob import TextBlob

sns.set(rc={'figure.figsize':(6, 6)})

def polarity(text):
    """Return TextBlob's sentiment polarity score for ``text``."""
    return TextBlob(text).sentiment.polarity

# Item assignment is required to create a new column; `df.polarity_score = ...`
# would only set a Python attribute on the frame, which is not part of the
# data and is lost on any copy, filter or export.
df['polarity_score'] = df.title.apply(polarity)
df['polarity_score'].hist(color='skyblue')

We see that majority of the publications have a neutral polarity.

In [None]:
def sentiment(x):
    """Map a polarity score to a label.

    Returns 'neg' for scores below zero, 'neu' for a score equal to zero,
    and 'pos' for everything else.
    """
    if x < 0:
        return 'neg'
    return 'neu' if x == 0 else 'pos'

sns.set(rc={'figure.figsize':(6, 6)})
# Item assignment creates a real column; `df.sentiment = ...` would only set
# an attribute on the frame object.
df['sentiment'] = df.polarity_score.map(sentiment)

# Count each label once and reuse the result for both axes.
sentiment_counts = df['sentiment'].value_counts()
sns.barplot(x=sentiment_counts.index, y=sentiment_counts, palette='coolwarm')

We see the most prevalent publication titles are neutral followed by positive and then, negative.

# NER Analysis

In [None]:
# Download spaCy's small English pipeline, which is loaded by name below.
! python -m spacy download en_core_web_sm

In [None]:
import spacy

# Name of the spaCy pipeline used for all entity recognition below.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

In [None]:
def ner(text):
    """Return the label of every named entity spaCy finds in ``text``."""
    return [entity.label_ for entity in nlp(text).ents]

# Flatten the per-title label lists and tally how often each label occurs.
ent = [label for labels in df.title.apply(ner) for label in labels]
counter = Counter(ent)
count = counter.most_common()

In [None]:
# Entity labels (x) against their frequencies (y), most common first.
sns.set(rc={'figure.figsize':(15, 15)})
x, y = (list(column) for column in zip(*count))
sns.barplot(x=y, y=x, palette='husl')

We see that ORG, GPE and DATE entities dominate the tally. Let's analyze them.

**We'll first see ORG entities.**

In [None]:
def ner(text, ent="ORG"):
    """Return the text of every entity in ``text`` whose spaCy label is ``ent``."""
    doc = nlp(text)
    return [X.text for X in doc.ents if X.label_ == ent]

# Plain `apply` is used here: pandarallel is only installed and initialised
# in a later cell, so `parallel_apply` would fail at this point on a fresh
# kernel (Restart & Run All).
org = df.title.apply(ner)
org = [i for x in org for i in x]
counter = Counter(org)

x, y = map(list,zip(*counter.most_common(20)))
# Keyword arguments: recent seaborn releases reject positional x/y vectors.
sns.barplot(x=y, y=x, palette='coolwarm')

We see various organizations (National Center for Education Statistics) and names of studies (Baltimore Longitudinal Study of Aging) popping up here. We also get various abbreviations, such as STEM, which may not be organizations.

In [None]:
! pip install pandarallel

In [None]:
from pandarallel import pandarallel

# Later cells call `parallel_apply` on `df.title`; presumably this call is
# what makes that method available, so it has to run first — TODO confirm.
pandarallel.initialize()

In [None]:
def ner(text, ent="GPE"):
    """Return the text of every entity in ``text`` whose spaCy label is ``ent``."""
    doc = nlp(text)
    return [X.text for X in doc.ents if X.label_ == ent]

# GPE entities found in the titles, flattened into one list and counted.
places = df.title.apply(ner)
places = [i for x in places for i in x]
counter = Counter(places)

x,y=map(list,zip(*counter.most_common(20)))
# Keyword arguments: recent seaborn releases reject positional x/y vectors.
sns.barplot(x=y, y=x, palette='viridis')

We see various countries and places mentioned. This gives us an insight into the geographical distribution
of the data. We see the most publications coming from the US, followed by China, India, South Korea, South Africa, Japan and Canada.

In [None]:
def ner(text, ent="DATE"):
    """Return the text of every entity in ``text`` whose spaCy label is ``ent``."""
    doc = nlp(text)
    return [X.text for X in doc.ents if X.label_ == ent]

# DATE entities found in the titles, flattened into one list and counted.
dates = df.title.apply(ner)
dates = [i for x in dates for i in x]
counter = Counter(dates)

x,y=map(list,zip(*counter.most_common(20)))
# Keyword arguments: recent seaborn releases reject positional x/y vectors.
sns.barplot(x=y, y=x, palette='viridis')

We see '1988' at the top of the tally. Why? Because of National Education Longitudinal Study of 1988 on which many publications are based. Followed by it are papers published in the time 1992 - 2020. We see a huge volume of the papers published in years 2009 - 2020.

# POS Tagging

**We'll now do Part-of-Speech Tagging.**

Here's the list of tags:

Noun (NN)- Joseph, London, table, cat, teacher, pen, city

Verb (VB)- read, speak, run, eat, play, live, walk, have, like, are, is

Adjective(JJ)- beautiful, happy, sad, young, fun, three

Adverb(RB)- slowly, quietly, very, always, never, too, well, tomorrow

Preposition (IN)- at, on, in, from, with, near, between, about, under

Conjunction (CC)- and, or, but, because, so, yet, unless, since, if

Pronoun(PRP)- I, you, we, they, he, she, it, me, us, them, him, her, this

Interjection (UH)- Ouch! Wow! Great! Help! Oh! Hey! Hi!

In [None]:
def pos(text):
    """Return the part-of-speech tag of every token in ``text``.

    Tags come from ``nltk.pos_tag`` applied to ``word_tokenize(text)``, in
    token order. A text without tokens yields an empty list instead of
    raising ``IndexError``.
    """
    return [tag for _, tag in nltk.pos_tag(word_tokenize(text))]

tags = df.title.parallel_apply(lambda x : pos(x))
tags = [x for l in tags for x in l]
counter = Counter(tags)

# Six most frequent tags: labels in x, counts in y.
x, y = list(map(list,zip(*counter.most_common(6))))
sns.barplot(x=y, y=x, palette='coolwarm')

We see proper nouns topping the list followed by nouns and prepositions.

**Let's see the most prevalent proper nouns used.**

In [None]:
def get_nouns(text):
    """Return the purely alphabetic tokens of ``text`` that are tagged 'NNP'."""
    tagged = nltk.pos_tag(word_tokenize(text))
    return [word for word, tag in tagged if tag == 'NNP' and word.isalpha()]

# Collect the matching tokens of every title into one flat list and count them.
words = [
    noun
    for nouns in df.title.parallel_apply(lambda title: get_nouns(title))
    for noun in nouns
]
counter = Counter(words)

x, y = (list(column) for column in zip(*counter.most_common(10)))
sns.barplot(x=y, y=x, palette='magma')

We again see Alzheimer topping the chart. It is followed by related words such as Disease, Brain and Cognitive.

**Let's see the top nouns used.**

In [None]:
def get_nouns(text):
    """Return the purely alphabetic tokens of ``text`` that are tagged 'NN'."""
    tagged = nltk.pos_tag(word_tokenize(text))
    return [word for word, tag in tagged if tag == 'NN' and word.isalpha()]

# Collect the matching tokens of every title into one flat list and count them.
words = [
    noun
    for nouns in df.title.parallel_apply(lambda title: get_nouns(title))
    for noun in nouns
]
counter = Counter(words)

x, y = (list(column) for column in zip(*counter.most_common(10)))
sns.barplot(x=y, y=x, palette='Accent')

We see "disease" topping the chart, followed by "brain". The words revolve around the theme of the brain and its impairment. Many studies and analyses have been carried out on this theme, which is why those words also appear in the list.

# Text Complexity

In [None]:
! pip install textstat

In [None]:
from textstat import flesch_reading_ease

# Flesch reading-ease score of every title, shown as a histogram.
reading_scores = df.title.parallel_apply(lambda title: flesch_reading_ease(title))
reading_scores.hist(color='black')

We see that the readability score mostly falls above 50, which means most of the publication titles can be read easily.

**Let's also see publication titles with a readability score of less than 5.**

In [None]:
df['reading'] = df.title.parallel_apply(lambda title: flesch_reading_ease(title))

# Print at most ten titles scoring below 5, each followed by a blank line.
hard_titles = df.loc[df.reading < 5, 'title']
for hard_title in hard_titles.head(10):
    print(hard_title)
    print()

We see that short publication titles have a low readability score. Titles containing rarely used words, such as "disidentification" or "postsecondary", also have a lower readability score.