In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import warnings
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path of the Reddit vaccine-myths CSV inside the Kaggle input directory.
data_path = '../input/reddit-vaccine-myths/reddit_vm.csv'
df = pd.read_csv(data_path)

In [None]:
# Keep only the rows whose title is not the literal string 'Comment'.
# The explicit .copy() makes `df` an independent frame, so columns added to it
# later in the notebook are written to `df` itself and not to a slice of the
# frame that was loaded from disk.
df = df.loc[df['title'] != 'Comment'].copy()

In [None]:
# Preview the first five rows of `df`.
df.head(n=5)

## The number of characters present in each sentence:

In [None]:
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt

# Character count of every title, computed once and reused for the plot and the summary.
title_lengths = df.title.str.len()

fig = sns.displot(x=title_lengths, data=df, color='black', kde=False, height=6, kind='hist')

# Shortest, longest and mean title length, one value per line.
for statistic in (title_lengths.min(), title_lengths.max(), title_lengths.mean()):
    print(statistic)

## The histogram shows that titles range from 1 to 298 characters, with an average length of about 99 characters.

In [None]:
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt

# Number of whitespace-separated words in each title.
temp = df.title.str.split().map(len)

fig = sns.displot(x=temp, color='blue', kde=False, height=6, kind='hist')

# Minimum, maximum and mean word count, one value per line.
print(temp.min(), temp.max(), temp.mean(), sep='\n')

## The number of words per title ranges from 1 to 50, with an average of about 17 words.

In [None]:
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def _mean_word_length(words):
    """Return the mean character length of the words in one split title."""
    return np.mean([len(word) for word in words])


# Mean word length of every title.
temp = df.title.str.split().map(_mean_word_length)

fig = sns.displot(x=temp, color='red', kde=False, height=6, aspect=2, kind='hist')

# Minimum, maximum and mean of the per-title averages, one value per line.
print(temp.min(), temp.max(), temp.mean(), sep='\n')

## The average word length per title ranges from 1 to 26 characters, with 5 being the most common length.

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# NLTK's English stop-word list, held as a set for fast membership tests.
english_stopwords = stopwords.words('english')
stop = set(english_stopwords)

In [None]:
from collections import defaultdict

# Split every title on whitespace and flatten the result into one list of
# words; the original casing is kept because `corpus` is reused further down.
title = df.title.str.split().values.tolist()
corpus = [word for words in title for word in words]

# Tally stop words case-insensitively. The other stop-word checks in this
# notebook lowercase the word before looking it up in `stop`; without that,
# capitalised occurrences such as "The" at the start of a title are never
# counted.
dic = defaultdict(int)
for word in corpus:
    lowered = word.lower()
    if lowered in stop:
        dic[lowered] += 1

In [None]:
# Rank the stop-word tallies from most to least frequent.
sorted_dic = sorted(dic.items(), key=lambda pair: pair[1])[::-1]

# Split the ten most frequent entries into labels and counts for plotting.
keys, values = [], []
for stop_word, tally in sorted_dic[:10]:
    keys.append(stop_word)
    values.append(tally)

sns.set(rc={'figure.figsize': (10, 10)})

fig = sns.barplot(x=keys, y=values, palette='colorblind')

## We see the top 10 most used stopwords in all the titles.

In [None]:
from collections import Counter
from nltk.stem import PorterStemmer

ps = PorterStemmer()
counter = Counter(corpus)
most = counter.most_common()

# Walk the 120 most frequent words, keeping alphabetic non-stop-words and only
# the first word seen for each Porter stem.
x, y, lookup = [], [], []
for word, count in most[:120]:
    lowered = word.lower()
    if lowered in stop:
        continue
    stem = ps.stem(lowered)
    if stem in lookup or not word.isalpha():
        continue
    x.append(word)
    y.append(count)
    lookup.append(stem)

sns.barplot(x=y, y=x)

## We see the most common words used in titles apart from the stopwords. 'vaccine' obviously dominates the tally. Other noteworthy words are 'polio', 'autism', 'flu', 'measles', 'CDC' and 'diseases'.

## N-Gram Exploration

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_ngram(corpus, n=None):
    """Return the ten most frequent n-grams that are not made up solely of stop words.

    Parameters
    ----------
    corpus : iterable of str
        Documents passed to ``CountVectorizer``.
    n : int, optional
        Number of words per n-gram. ``None`` (the default) is treated as 1,
        because ``ngram_range`` needs integer bounds.

    Returns
    -------
    list of (str, count) tuples
        At most ten ``(ngram, frequency)`` pairs, most frequent first.
        N-grams whose every word is in the module-level ``stop`` set are
        excluded.
    """
    if n is None:
        n = 1

    vec = CountVectorizer(ngram_range=(n, n))
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the corpus-wide frequency of every n-gram.
    sum_words = bag_of_words.sum(axis=0)

    words_freq = [
        (ngram, sum_words[0, idx])
        for ngram, idx in vec.vocabulary_.items()
        # Drop n-grams that consist of stop words only.
        if not all(token in stop for token in ngram.split())
    ]
    words_freq.sort(key=lambda item: item[1], reverse=True)
    return words_freq[:10]

In [None]:
# Most frequent bigrams across all titles (at most ten).
top_n_bigrams = get_top_ngram(df.title, n=2)[:10]

# Transpose the (bigram, count) pairs into a label list and a count list.
x, y = (list(column) for column in zip(*top_n_bigrams))

sns.barplot(x=y, y=x, palette='hls')

## We see that bigrams like 'big pharma', 'anti vax' and 'covid 19', along with other vaccine-related bigrams, dominate the titles.

In [None]:
# Most frequent trigrams across all titles (at most ten).
top_n_trigrams = get_top_ngram(df.title, n=3)[:10]

# Transpose the (trigram, count) pairs into a label list and a count list.
x, y = (list(column) for column in zip(*top_n_trigrams))

sns.barplot(x=y, y=x, palette='coolwarm')

## We observe the following:
## * The subreddit appears to be US-centric, given the presence of the trigram 'in the us'.
## * People are talking about 'covid 19 vaccine'.
## * Some people are against vaccines, as indicated by the trigram 'vaccines are bad'.
## * People are worried about unvaccinated children, as indicated by the trigram 'an unvaccinated child'.
## * People are talking about 'anti vaccination movement'.
## * People want to discover the 'truth about vaccines'.

# Topic Modelling

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

def preprocess_news(df):
    """Tokenise and lemmatise the titles of ``df``.

    For every value in ``df.title`` the text is tokenised with
    ``word_tokenize``; tokens that are non-alphabetic or whose lowercase form
    is in the module-level ``stop`` set are dropped, tokens of one or two
    characters are dropped, and the remainder are lemmatised with
    ``WordNetLemmatizer``.

    Returns
    -------
    list of list of str
        One list of lemmatised tokens per title, in the order of ``df.title``.
    """
    lem = WordNetLemmatizer()
    corpus = []
    for news in df.title:
        words = [w for w in word_tokenize(news) if (w.lower() not in stop and w.isalpha())]
        corpus.append([lem.lemmatize(w) for w in words if len(w) > 2])
    return corpus


# NOTE: from here on `corpus` is a list of token lists (one per title), not
# the flat word list built earlier in the notebook.
corpus = preprocess_news(df)

In [None]:
import gensim

# Build the gensim dictionary from the tokenised titles, then encode each
# document with its doc2bow method.
dic = gensim.corpora.Dictionary(corpus)
bow_corpus = list(map(dic.doc2bow, corpus))

In [None]:
# Fit a 5-topic LDA model on the bag-of-words corpus. LDA training is
# stochastic, so a fixed random_state is passed to keep the discovered topics
# comparable from one run of the notebook to the next.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=5,
                                       id2word=dic,
                                       passes=10,
                                       workers=2,
                                       random_state=42)
lda_model.show_topics()

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

# Prepare the topic visualisation from the fitted model, its corpus and its
# dictionary, then render it in the notebook.
LDAvis_prepared = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dic,
)
pyLDAvis.display(LDAvis_prepared)

## We discover that the most talked-about topics relate vaccination to autism, measles, polio and cancer.

In [None]:
from wordcloud import WordCloud, STOPWORDS
# Kept under its own name so that the NLTK `stopwords` corpus imported earlier
# in the notebook is not overwritten by a plain set.
wordcloud_stopwords = set(STOPWORDS)


def _as_text(data):
    """Flatten ``data`` (a string, tokens, or lists of tokens) into one plain string."""
    if isinstance(data, str):
        return data
    try:
        return ' '.join(
            item if isinstance(item, str) else ' '.join(map(str, item))
            for item in data
        )
    except TypeError:
        # Not an iterable of tokens / token lists: fall back to its string form.
        return str(data)


def show_wordcloud(data):
    """Render a word cloud of ``data`` with matplotlib.

    ``data`` may be a string, an iterable of tokens, or an iterable of token
    lists. Token containers are joined with spaces before being handed to
    ``WordCloud.generate``; calling ``str()`` on a list would instead pass its
    Python repr, including brackets, commas and quote characters.
    """
    wordcloud = WordCloud(
        background_color=None,
        stopwords=wordcloud_stopwords,
        max_words=1000,
        max_font_size=30,
        scale=4,
        random_state=42,
        mode='RGBA',
        colormap='plasma')

    wordcloud = wordcloud.generate(_as_text(data))

    plt.figure(1, figsize=(15, 15))
    plt.axis('off')

    plt.imshow(wordcloud)
    plt.show()


show_wordcloud(corpus)

## We see from the wordcloud that most important words are vaccine, vaccination, kid, child, parent, Big Pharma, measles and autism.

# Sentiment Analysis

In [None]:
from textblob import TextBlob

def polarity(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity


# Item assignment creates a real DataFrame column. Assigning to a new
# attribute name (df.polarity_score = ...) only attaches a Python attribute to
# the object, so the scores would not be part of the frame's data.
df['polarity_score'] = df.title.apply(polarity)
df['polarity_score'].hist(color='skyblue')

## We see that the majority of the titles have a neutral polarity.

In [None]:
def sentiment(x):
    """Map a polarity score to a label: 'neg' below zero, 'neu' at zero, otherwise 'pos'."""
    if x == 0:
        return 'neu'
    return 'neg' if x < 0 else 'pos'
    
# Item assignment creates a real `sentiment` column; assigning to a new
# attribute name would only attach a Python attribute to the DataFrame object.
df['sentiment'] = df.polarity_score.map(sentiment)

# Count each label once and reuse the result for both axes of the chart.
sentiment_counts = df['sentiment'].value_counts()
sns.barplot(x=sentiment_counts.index, y=sentiment_counts, palette='coolwarm')

## We see that 46% of the titles have a neutral sentiment, 31% have a positive sentiment and 23% have a negative sentiment.
## Let's take a look at some of the positive and negative titles.

In [None]:
# Print the first five titles labelled 'pos', each followed by a blank line.
for headline in df.loc[df.sentiment == 'pos', 'title'].head():
    print(headline, end='\n\n')

## The positive titles are around COVID-19 vaccine and Anti-Vaxxers.

In [None]:
# Print the first five titles labelled 'neg', each followed by a blank line.
for headline in df.loc[df.sentiment == 'neg', 'title'].head():
    print(headline, end='\n\n')

## The negative titles are around COVID-19 vaccine, Anti-Vaxxers and Dangerous Myths.

In [None]:
! python -m spacy download en_core_web_sm

In [None]:
import spacy

# Load the spaCy pipeline used by the entity-recognition cells.
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

In [None]:
def ner(text):
    """Return the label of every named entity that ``nlp`` finds in ``text``."""
    labels = []
    for entity in nlp(text).ents:
        labels.append(entity.label_)
    return labels


# Collect the entity labels of all titles into one flat list and tally them.
ent = []
for title_labels in df.title.apply(ner):
    ent.extend(title_labels)

counter = Counter(ent)
count = counter.most_common()

In [None]:
# Separate the (label, frequency) pairs into parallel lists for plotting.
x, y = (list(column) for column in zip(*count))
sns.barplot(x=y, y=x, palette='husl')

## We see that ORG, PERSON and CARDINAL entities dominate the tally.

In [None]:
def ner(text, ent="ORG"):
    """Return the text of every entity in ``text`` whose label equals ``ent``."""
    return [entity.text for entity in nlp(text).ents if entity.label_ == ent]


# Gather every ORG mention across the titles. The entity type is passed
# explicitly so the result does not depend on which `ner` definition (and
# default) was executed last.
org = df.title.apply(lambda x: ner(x, ent="ORG"))
org = [i for x in org for i in x]
counter = Counter(org)

x, y = map(list, zip(*counter.most_common(10)))
# x/y are passed by keyword, like every other barplot call in this notebook;
# newer seaborn releases do not accept them positionally.
sns.barplot(x=y, y=x, palette='coolwarm')

## We see that organizations like CDC, FDA, Pfizer and Monsanto are the prime focus.

In [None]:
def ner(text, ent="PERSON"):
    """Return the text of every entity in ``text`` whose label equals ``ent``."""
    return [entity.text for entity in nlp(text).ents if entity.label_ == ent]


# Gather every PERSON mention across the titles. The entity type is passed
# explicitly so the result does not depend on which `ner` definition (and
# default) was executed last.
person = df.title.apply(lambda x: ner(x, ent="PERSON"))
person = [i for x in person for i in x]
counter = Counter(person)

x, y = map(list, zip(*counter.most_common(10)))
# x/y are passed by keyword, like every other barplot call in this notebook;
# newer seaborn releases do not accept them positionally.
sns.barplot(x=y, y=x, palette='viridis')

## We see Bill Gates mentioned in the titles.

In [None]:
def ner(text, ent="CARDINAL"):
    """Return the text of every entity in ``text`` whose label equals ``ent``."""
    return [entity.text for entity in nlp(text).ents if entity.label_ == ent]


# Gather every CARDINAL mention across the titles. The entity type is passed
# explicitly so the result does not depend on which `ner` definition (and
# default) was executed last.
cardinal = df.title.apply(lambda x: ner(x, ent="CARDINAL"))
cardinal = [i for x in cardinal for i in x]
counter = Counter(cardinal)

x, y = map(list, zip(*counter.most_common(10)))
# x/y are passed by keyword, like every other barplot call in this notebook;
# newer seaborn releases do not accept them positionally.
sns.barplot(x=y, y=x, palette='twilight')

## We see a lot of numbers mentioned in the titles. Numbers work as a support for the argument, hence, their abundance in titles.

# POS Tagging
Noun (NN)- Joseph, London, table, cat, teacher, pen, city

Verb (VB)- read, speak, run, eat, play, live, walk, have, like, are, is

Adjective(JJ)- beautiful, happy, sad, young, fun, three

Adverb(RB)- slowly, quietly, very, always, never, too, well, tomorrow

Preposition (IN)- at, on, in, from, with, near, between, about, under

Conjunction (CC)- and, or, but, because, so, yet, unless, since, if

Pronoun(PRP)- I, you, we, they, he, she, it, me, us, them, him, her, this

Interjection (UH)- Ouch! Wow! Great! Help! Oh! Hey! Hi!

In [None]:
def pos(text):
    """Return the part-of-speech tag of every token in ``text``.

    The tags come from ``nltk.pos_tag`` applied to ``word_tokenize(text)``.
    A text that yields no tokens gives an empty list.
    """
    return [tag for _, tag in nltk.pos_tag(word_tokenize(text))]


# Flatten the per-title tag lists and count how often each tag occurs.
tags = df.title.apply(pos)
tags = [x for l in tags for x in l]
counter = Counter(tags)

# Plot the six most frequent tags.
x, y = list(map(list, zip(*counter.most_common(6))))
sns.barplot(x=y, y=x, palette='coolwarm')

## We see nouns topping the charts followed by plural nouns and interjections. The facts are presented as is and not embellished (other adjectives would be there).
## Let's see the most prevalent nouns used.

In [None]:
def get_nouns(text, tag='NN'):
    """Return the tokens of ``text`` that ``nltk.pos_tag`` labels with ``tag``.

    ``tag`` defaults to ``'NN'``, so ``get_nouns(text)`` returns the tokens
    tagged ``'NN'``.
    """
    return [word for word, word_tag in nltk.pos_tag(word_tokenize(text)) if word_tag == tag]


# Collect every token tagged 'NN' across the titles; the tag is passed
# explicitly so the result does not depend on which `get_nouns` definition was
# executed last.
words = df.title.apply(lambda x: get_nouns(x, tag='NN'))
words = [x for l in words for x in l]
counter = Counter(words)

# Plot the ten most frequent ones.
x, y = list(map(list, zip(*counter.most_common(10))))
sns.barplot(x=y, y=x, palette='magma')

## We see the highest usage of vaccine (of course!), followed by vaccination, autism, polio, child, anyone, disease, risk and cause.
## Let's see the plural nouns used.

In [None]:
def get_nouns(text, tag='NNS'):
    """Return the tokens of ``text`` that ``nltk.pos_tag`` labels with ``tag``.

    ``tag`` defaults to ``'NNS'``, so ``get_nouns(text)`` returns the tokens
    tagged ``'NNS'``.
    """
    return [word for word, word_tag in nltk.pos_tag(word_tokenize(text)) if word_tag == tag]


# Collect every token tagged 'NNS' across the titles; the tag is passed
# explicitly so the result does not depend on which `get_nouns` definition was
# executed last.
words = df.title.apply(lambda x: get_nouns(x, tag='NNS'))
words = [x for l in words for x in l]
counter = Counter(words)

# Plot the ten most frequent ones.
x, y = list(map(list, zip(*counter.most_common(10))))
sns.barplot(x=y, y=x, palette='Accent')

## We see the usage of vaccines, children, people, Vaccines, measles, parents, vaccinations, kids, studies and years.
## Let's explore the complexity of text used in the titles.

In [None]:
! pip install textstat

In [None]:
from textstat import flesch_reading_ease

# Flesch reading-ease score of every title, shown as a histogram.
reading_ease = df.title.apply(flesch_reading_ease)
reading_ease.hist(color='black')

## We see that the readability scores for the titles mostly fall above 50. This means the titles can be easily read and understood.
## Let's also check the titles with low readability scores.

In [None]:
# Keep the Flesch reading-ease score of each title as a column.
df['reading'] = df.title.apply(flesch_reading_ease)

# Print every title scoring below 5, each followed by a blank line.
for hard_title in df.loc[df.reading < 5, 'title']:
    print(hard_title, end='\n\n')

## We see that short titles which convey no meaning have the least readability score.