In [None]:
import pandas as pd # DataFrame handling and CSV file I/O.
import numpy as np # Numerical helpers (used below to round the bar-chart labels).
import re # Regular expressions, used by the text-cleaning function.
import nltk # Natural Language Toolkit (language processing).
nltk.download('stopwords') # Download the NLTK stop-word corpus so stopwords.words() can be used.
from nltk.corpus import stopwords # Lists of very common words (e.g. "the", "is") that carry little meaning.
import matplotlib.pyplot as plt # Basic visualization.
from wordcloud import WordCloud # Word-frequency cloud images.
from textblob import TextBlob # Provides the sentiment polarity score of a text.
from nltk.stem import PorterStemmer # Porter stemmer, reduces words to their stems.
pd.set_option('display.max_colwidth', None) # Show the full text of each cell instead of truncating long strings.

In [None]:
# Load the Reddit vaccine-myths CSV from its Kaggle input path into a DataFrame.
data = pd.read_csv('/kaggle/input/reddit-vaccine-myths/reddit_vm.csv')

In [None]:
data.head() # Show the first five rows of the dataset.

In [None]:
data.info() # Summarize the dataset: index range, columns with their non-null counts and dtypes, and memory usage.

In [None]:
data.isnull().sum() # Count the missing values in each column. Per the author's note, the missing values are in the url and body columns; only title and body are analysed below.

# **Sentiment Based on Title**

In [None]:
# Build the frame used for the title analysis by dropping comment rows.
# Per the original note, comment rows carry the placeholder title "Comment"
# and no real title text.
# The placeholder is matched exactly: a substring test would also discard
# genuine post titles that merely contain the word "Comment".
# .copy() makes this an independent frame, so the column assignments in the
# following cells do not act on a slice of `data`.
is_comment = data['title'].str.strip() == 'Comment'
data_title = data[~is_comment].copy()

In [None]:
# Remove English stop words from every title.
# Each title is split into words first: iterating over the string itself would
# compare single characters against the stop list. The filtered text is written
# back to the column so the later cleaning steps actually receive it.
# NOTE(review): the NLTK list includes negations such as "no" and "not";
# dropping them can change the polarity computed later - confirm this is wanted.
stop = set(stopwords.words('english'))
data_title['title'] = data_title['title'].apply(
    lambda title: ' '.join(word for word in title.split() if word.lower() not in stop)
)

In [None]:
#Define a cleaning function that strips links, punctuation, digits and other special characters from a text.
def review_cleaning(text):
    """Reduce a raw text to letters separated by single spaces.

    Links, punctuation, digits and every other non-alphanumeric character are
    replaced by spaces, and runs of such characters collapse to one space.
    The result may start or end with a single space.

    Parameters
    ----------
    text : str
        Raw title or body text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Remove links first, while "://", "." and "/" are still present.
    # If punctuation were stripped beforehand, the URL would fall apart into
    # separate words and only the bare "https" token would be removed.
    # (A single http\S+ pass also covers https links.)
    text = re.sub(r'http\S+', ' ', text)
    # Replace punctuation characters with spaces.
    text = re.sub(r'([!”#$%&’()*+,-./:;<=>?[\]^_`{|}~])', ' ', text)
    # Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text)
    # Remove numbers.
    text = re.sub(r'\d+', ' ', text)
    # Replace any remaining run of non-alphanumeric characters with one space.
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    return text

In [None]:
# Run every title through review_cleaning and store the cleaned text back in the column.
data_title['title'] = data_title['title'].map(review_cleaning)

In [None]:
# Strip the common morphological and inflexional endings from English words
# (Porter stemming).
# The stemmer works on one word at a time, so each title is split into words,
# every word is stemmed, and the stems are joined back together. Passing the
# whole title to ps.stem() would treat it as a single word and only alter the
# end of the string.
ps = PorterStemmer()
data_title['title'] = data_title['title'].apply(
    lambda title: ' '.join(ps.stem(word) for word in title.split())
)

In [None]:
#Derive a sentiment label for each text from its polarity score: first define the helper functions, then apply them.
def getpolarity(text):
    """Return the TextBlob sentiment polarity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity
def getsentiment(score):
    """Translate a polarity score into a sentiment label.

    Returns "Positive" for scores above zero, "Neutral" for exactly zero and
    "Negative" for scores below zero. A score that satisfies none of these
    comparisons (e.g. NaN) yields None.
    """
    if score > 0:
        label = "Positive"
    elif score == 0:
        label = "Neutral"
    elif score < 0:
        label = "Negative"
    else:
        label = None
    return label
# Score each title, then label the score as Negative / Neutral / Positive.
data_title['Polarity'] = data_title['title'].map(getpolarity)
data_title['Sentiment'] = data_title['Polarity'].map(getsentiment)

In [None]:
# Bar chart of how many titles fall into each sentiment class.
plt.figure(figsize=(7, 7))
sentiment_counts = data_title['Sentiment'].value_counts()
ax = sentiment_counts.plot(kind='bar')
# Write each bar's count slightly above the top of the bar.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        np.round(bar_top, decimals=2),
        (bar_centre, bar_top),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )
plt.title('Sentiment based on Title', size='15')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

In [None]:
#Plot a word cloud of the titles classified as positive.
# Comment rows were already removed when data_title was built, so only the
# sentiment label needs to be checked here.
data_pos = data_title[data_title['Sentiment'] == 'Positive']
# drop_duplicates() removes repeated titles while keeping first-seen order.
# set() order changes between kernel sessions, which changed the joined text.
title_pos = " ".join(data_pos['title'].drop_duplicates())
plt.rcParams['figure.figsize'] = (15,15)
# A fixed random_state makes the word layout reproducible between runs.
wordcloud = WordCloud(
    max_font_size = 50,
    max_words = 100,
    background_color = 'white',
    random_state = 42,
).generate(title_pos)

plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Positive Vaccine Myths Title', size = 23, color = 'darkblue')
plt.show()

In [None]:
# Show ten positive titles, ordered from the lowest polarity upwards.
check_pos = pd.DataFrame(data=data_pos)
sortedDF = check_pos.sort_values('Polarity', ascending=True)
sortedDF.head(10)['title']

In [None]:
#Plot a word cloud of the titles classified as neutral.
# Comment rows were already removed when data_title was built, so only the
# sentiment label needs to be checked here.
data_neut = data_title[data_title['Sentiment'] == 'Neutral']
# drop_duplicates() removes repeated titles while keeping first-seen order.
# set() order changes between kernel sessions, which changed the joined text.
title_neut = " ".join(data_neut['title'].drop_duplicates())
plt.rcParams['figure.figsize'] = (15,15)
# A fixed random_state makes the word layout reproducible between runs.
wordcloud = WordCloud(
    max_font_size = 50,
    max_words = 100,
    background_color = 'white',
    random_state = 42,
).generate(title_neut)

plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Neutral Vaccine Myths Title', size = 23, color = 'darkblue')
plt.show()

In [None]:
# Show ten neutral titles (sorted by polarity, ascending).
check_neut = pd.DataFrame(data=data_neut)
sortedDF = check_neut.sort_values('Polarity', ascending=True)
sortedDF.head(10)['title']

In [None]:
#Plot a word cloud of the titles classified as negative.
# Comment rows were already removed when data_title was built, so only the
# sentiment label needs to be checked here.
data_neg = data_title[data_title['Sentiment'] == 'Negative']
# drop_duplicates() removes repeated titles while keeping first-seen order.
# set() order changes between kernel sessions, which changed the joined text.
title_neg = " ".join(data_neg['title'].drop_duplicates())
plt.rcParams['figure.figsize'] = (15,15)
# A fixed random_state makes the word layout reproducible between runs.
wordcloud = WordCloud(
    max_font_size = 50,
    max_words = 100,
    background_color = 'white',
    random_state = 42,
).generate(title_neg)

plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Negative Vaccine Myths Title', size = 23, color = 'darkblue')
plt.show()

In [None]:
# Show ten negative titles, starting with the lowest (most negative) polarity.
check_neg = pd.DataFrame(data=data_neg)
sortedDF = check_neg.sort_values('Polarity', ascending=True)
sortedDF.head(10)['title']

# **Sentiment Based on Content**

In [None]:
# Keep only the rows that have body text (drop rows whose body is missing).
# .copy() makes this an independent frame, so the column assignments in the
# following cells do not trigger chained-assignment warnings.
data_content = data.dropna(axis=0, subset=['body']).copy()

In [None]:
# Remove English stop words from every body text.
# Each body is split into words first: iterating over the string itself would
# compare single characters against the stop list. The filtered text is written
# back to the column so the later cleaning steps actually receive it.
# NOTE(review): the NLTK list includes negations such as "no" and "not";
# dropping them can change the polarity computed later - confirm this is wanted.
data_content['body'] = data_content['body'].apply(
    lambda body: ' '.join(word for word in body.split() if word.lower() not in stop)
)

In [None]:
# Run every body text through review_cleaning and store the cleaned text back in the column.
data_content['body'] = data_content['body'].map(review_cleaning)

In [None]:
# Strip the common morphological and inflexional endings from English words
# (Porter stemming).
# The stemmer works on one word at a time, so each body is split into words,
# every word is stemmed, and the stems are joined back together. Passing the
# whole text to ps.stem() would treat it as a single word and only alter the
# end of the string.
data_content['body'] = data_content['body'].apply(
    lambda body: ' '.join(ps.stem(word) for word in body.split())
)

In [None]:
# Score each body text, then label the score as Negative / Neutral / Positive.
data_content['Polarity'] = data_content['body'].map(getpolarity)
data_content['Sentiment'] = data_content['Polarity'].map(getsentiment)

In [None]:
# Bar chart of how many body texts fall into each sentiment class.
plt.figure(figsize=(7,7))
ax = data_content['Sentiment'].value_counts().plot(kind = 'bar')
# Write each bar's count slightly above the top of the bar.
for p in ax.patches:
    ax.annotate(np.round(p.get_height(),decimals=2),(p.get_x()+p.get_width()/2., p.get_height()),ha='center',va='center',xytext=(0, 10),textcoords='offset points')
# This chart is built from data_content, so it is labelled "Content"
# (the heading previously repeated the title chart's "Title" label).
plt.title('Sentiment based on Content', size = '15')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

In [None]:
#Plot a word cloud of the body texts classified as positive.
# Bodies that still contain "http" are left out. A separate "https" test is
# unnecessary because any text containing "https" also contains "http".
has_link = data_content['body'].str.contains("http")
data_pos = data_content[(data_content['Sentiment'] == 'Positive') & ~has_link]
# drop_duplicates() removes repeated texts while keeping first-seen order.
# set() order changes between kernel sessions, which changed the joined text.
title_pos = " ".join(data_pos['body'].drop_duplicates())

# Draw the word cloud.
plt.rcParams['figure.figsize'] = (15,15)
# A fixed random_state makes the word layout reproducible between runs.
wordcloud = WordCloud(
    max_font_size = 50,
    max_words = 100,
    background_color = 'white',
    random_state = 42,
).generate(title_pos)

plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Positive Vaccine Myths Content', size = 23, color = 'darkblue')
plt.show()

In [None]:
# Show ten positive body texts, ordered from the lowest polarity upwards.
check_pos = pd.DataFrame(data=data_pos)
sortedDF = check_pos.sort_values('Polarity', ascending=True)
sortedDF.head(10)['body']

In [None]:
#Plot a word cloud of the body texts classified as neutral.
# Bodies that still contain "http" are left out. A separate "https" test is
# unnecessary because any text containing "https" also contains "http".
has_link = data_content['body'].str.contains("http")
data_neut = data_content[(data_content['Sentiment'] == 'Neutral') & ~has_link]
# drop_duplicates() removes repeated texts while keeping first-seen order.
# set() order changes between kernel sessions, which changed the joined text.
title_neut = " ".join(data_neut['body'].drop_duplicates())

# Draw the word cloud.
plt.rcParams['figure.figsize'] = (15,15)
# A fixed random_state makes the word layout reproducible between runs.
wordcloud = WordCloud(
    max_font_size = 50,
    max_words = 100,
    background_color = 'white',
    random_state = 42,
).generate(title_neut)

plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Neutral Vaccine Myths Content', size = 23, color = 'darkblue')
plt.show()

In [None]:
# Show ten neutral body texts (sorted by polarity, ascending).
check_neut = pd.DataFrame(data=data_neut)
sortedDF = check_neut.sort_values('Polarity', ascending=True)
sortedDF.head(10)['body']

In [None]:
#Plot a word cloud of the body texts classified as negative.
# Bodies that still contain "http" are left out. A separate "https" test is
# unnecessary because any text containing "https" also contains "http".
has_link = data_content['body'].str.contains("http")
data_neg = data_content[(data_content['Sentiment'] == 'Negative') & ~has_link]
# drop_duplicates() removes repeated texts while keeping first-seen order.
# set() order changes between kernel sessions, which changed the joined text.
title_neg = " ".join(data_neg['body'].drop_duplicates())

# Draw the word cloud.
plt.rcParams['figure.figsize'] = (15,15)
# A fixed random_state makes the word layout reproducible between runs.
wordcloud = WordCloud(
    max_font_size = 50,
    max_words = 100,
    background_color = 'white',
    random_state = 42,
).generate(title_neg)

plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Negative Vaccine Myths Content', size = 23, color = 'darkblue')
plt.show()

In [None]:
# Show ten negative body texts, starting with the lowest (most negative) polarity.
check_neg = pd.DataFrame(data=data_neg)
sortedDF = check_neg.sort_values('Polarity', ascending=True)
sortedDF.head(10)['body']