# Reddit Vaccine Myths Sentiment Analysis

## Install the dependencies

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import spacy
import re
import matplotlib.pyplot as plt
!pip install spacytextblob==0.1.7
from spacytextblob.spacytextblob import SpacyTextBlob

In [None]:
# Location of the Reddit vaccine-myths CSV, relative to the notebook.
DATA_PATH = '../input/reddit-vaccine-myths/reddit_vm.csv'

# Read the raw posts and preview the first 20 rows.
df = pd.read_csv(DATA_PATH)
df.head(20)

# Data Preprocessing
### Remove the unneeded columns and drop rows with null values

In [None]:
# Remove the 'url', 'created' and 'id' columns (not used later in this notebook)
# and discard every row that still contains a missing value.
# errors='ignore' together with reassignment (instead of inplace=True) keeps the
# cell re-runnable: executing it twice no longer raises KeyError for columns
# that are already gone.
df = (
    df.drop(columns=['url', 'created', 'id'], errors='ignore')
      .dropna()
      .copy()  # independent frame, so later column assignments do not warn
)
df.head()

In [None]:
# Load spaCy's 'en_core_web_sm' English pipeline; a later cell attaches the
# TextBlob sentiment component to this `nlp` object.
nlp = spacy.load('en_core_web_sm')

## Removal of hyperlinks
 Regex is used to remove hyperlinks in the text body

In [None]:
# Remove every http:// or https:// URL from the post body while keeping the text
# around it. Splitting on the first 'https://' and keeping only the leading part
# would throw away everything written after the first link and would miss plain
# http links; a raw-string pattern also avoids the invalid '\/' escape.
df['body'] = df['body'].astype(str).str.replace(r'https?://\S+', '', regex=True)

In [None]:
# dropna() returns a new frame rather than modifying df, so the result has to be
# assigned back for the call to have any effect. Null rows were already removed
# earlier and 'body' has been coerced to str, so this acts as a safety net.
df = df.dropna()
df.head()

# Perform sentiment analysis

In [None]:
# Attach the TextBlob sentiment component to the spaCy pipeline so that processed
# docs expose `doc._.sentiment` (read in the cells below).
# NOTE(review): passing a component instance to add_pipe looks like the older,
# spaCy v2-style API that goes with the pinned spacytextblob==0.1.7 — confirm
# before upgrading either package.
# NOTE(review): re-running this cell on the same `nlp` object will presumably fail
# because the component is already registered — TODO confirm.
spacy_text_blob = SpacyTextBlob()
nlp.add_pipe(spacy_text_blob)

Remember: `df.iterrows()` yields a copy of each row, so changes made to `row` are not reflected in the original dataframe. To update the dataframe, write through `df.loc[index, ...]` using the index returned by the iteration.


https://stackoverflow.com/questions/25478528/updating-value-in-iterrow-for-pandas

In [None]:
# Score every post body with the TextBlob component in a single batched pass.
# nlp.pipe streams the texts through the pipeline, and the two result columns are
# assigned in bulk instead of being written one cell at a time with df.loc inside
# an iterrows loop. Printing every body and its scores was dropped because it
# floods the cell output; inspect a sample with df.head() / df.tail() instead.
sentiments = [doc._.sentiment for doc in nlp.pipe(df['body'].astype(str))]

# Column order matters: a later cell selects columns by position.
df['Polarity'] = [sentiment.polarity for sentiment in sentiments]
df['Subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

In [None]:
# Show the last rows to check the Polarity and Subjectivity columns added above.
df.tail()

In [None]:
# Number of rows and columns in the scored dataframe, as a (rows, columns) tuple.
df.shape

In [None]:
# Take the columns at positions 3-6 of the scored frame as the working set.
# NOTE(review): with the column order produced above these are presumably
# body, timestamp, Polarity and Subjectivity — confirm against df.columns.
# .copy() makes df2 an independent frame, so the Year/Month columns added later
# are written to df2 itself rather than to a slice of df (which would trigger
# pandas' SettingWithCopyWarning).
df2 = df.iloc[:, 3:7].copy()
df2.tail()

## Visualising polarity vs subjectivity using Seaborn
Polarity indicates how positive or negative a sentence is. It is a float in the range [-1, 1].
Subjectivity indicates how much of the text is personal opinion rather than factual information. It is a float in the range [0, 1], where 0 is fully objective (factual) and 1 is fully subjective.
E.g., "The sun rises in the east" is an objective (factual) sentence, so it should score low on subjectivity.



In [None]:
# Score a descriptive, encyclopedic passage to see what the sentiment component
# reports for it.
text1 = 'Earths mechanically rigid outer layer, the lithosphere, is divided into tectonic plates. These plates are rigid segments that move relative to each other at one of three boundaries types: at convergent boundaries, two plates come together; at divergent boundaries, two plates are pulled apart; and at transform boundaries, two plates slide past one another laterally. Along these plate boundaries, earthquakes, volcanic activity, mountain-building, and oceanic trench formation can occur.[115] The tectonic plates ride on top of the asthenosphere, the solid but less-viscous part of the upper mantle that can flow and move along with the plates'
doc = nlp(text1)
sentiment = doc._.sentiment
polarity = sentiment.polarity
subjectivity = sentiment.subjectivity
print(f"Subjectivity: {subjectivity} \nPolarity: {polarity}")

In [None]:
# Derive Year and Month from the timestamp string with vectorised slicing
# (characters 0-3 and 5-6) instead of writing one cell at a time inside an
# iterrows loop. assign() returns a new frame, so nothing is written into a
# slice of another dataframe.
# assumes timestamp strings start with 'YYYY-MM' — TODO confirm
df2 = df2.assign(
    Year=df2['timestamp'].str[0:4],
    Month=df2['timestamp'].str[5:7],
)
df2.head()

In [None]:
# (rows, columns) of the working frame after the Year and Month columns were added.
df2.shape

## Removing the cases where polarity and subjectivity are irrelevant

In [None]:
# Work on a copy of df2 (named df4) so the filtering below leaves df2 untouched.
df4 = df2.copy()

# Rows whose polarity and subjectivity are both exactly zero are treated as
# irrelevant; count them with a boolean mask.
is_irrelevant = (df4['Polarity'] == 0.00) & (df4['Subjectivity'] == 0.00)
c = int(is_irrelevant.sum())

print("No. of irrelevant values:", c)

In [None]:
# Keep only the rows where polarity or subjectivity is non-zero. A single boolean
# mask does this in one pass, rather than dropping rows one at a time inside an
# iterrows loop, which rebuilds the frame on every match.
is_irrelevant = (df4['Polarity'] == 0) & (df4['Subjectivity'] == 0)
df4 = df4.loc[~is_irrelevant].copy()  # .copy() so later column additions do not warn
df4.head()

In [None]:
df4.shape
# (rows, columns) left after removing the zero-sentiment rows, i.e. the total
# number of datapoints available for the plots below.

# Visualisations

In [None]:
# Line plot of subjectivity against polarity for the filtered posts.
sns.set_theme(style="darkgrid")  # global seaborn theme for all later plots
sns.set_palette("summer_r", 2)   # palette name, number of colours
polarity_axes = sns.lineplot(data=df4, x="Polarity", y="Subjectivity")
polarity_axes.set_title("Polarity vs Subjectivity", size=20)
plt.show()

In [None]:
# Scatter plot with a fitted regression line (drawn in blue) of subjectivity
# against polarity.
sns.set_palette("magma_r", 2)  # palette name, number of colours
sns.lmplot(
    data=df4,
    x="Polarity",
    y="Subjectivity",
    line_kws=dict(color='Blue'),
)
plt.title("Polarity vs Subjectivity", size=20)
plt.show()

In [None]:
# Numeric frame holding month, subjectivity and polarity for the plots below.
# Month was sliced out of the timestamp as a string, so all three columns are
# cast to float64. (A first df5 assignment that was immediately overwritten by
# this one has been removed as dead code.)
df5 = df4[['Month', 'Subjectivity', 'Polarity']].astype('float64')
df5.info()

# Regression plot of polarity by month.
fig = sns.lmplot(x='Month', y='Polarity', data=df5)
plt.title("Month vs Polarity", size=20)
plt.show()

In [None]:
# Wide scatter plot with polarity on the x-axis and month on the y-axis.
fig, ax = plt.subplots(figsize=(20, 10))
# Note: `fig` is rebound here to the Axes object returned by scatterplot.
fig = sns.scatterplot(data=df5, x='Polarity', y='Month', ax=ax)
ax.set_title("Month vs Polarity", size=20)
plt.show()

Here, we see that the majority of posts are neutral in every month, with polarity lying around zero (between -0.25 and 0.25). From this we can conclude that the myths revolving around vaccines are expressed in relatively neutral language.

In [None]:
# Regression plot of subjectivity by month.
fig = sns.lmplot(data=df5, x='Month', y='Subjectivity')
plt.title("Month vs Subjectivity", size=20)
plt.show()

In [None]:
# Scatter plot of subjectivity against polarity on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=df5, x='Polarity', y='Subjectivity', ax=ax)

# Title and axis labels.
ax.set_title("Polarity vs Subjectivity", size=24)
ax.set_xlabel("Polarity", size=12)
ax.set_ylabel("Subjectivity", size=12)
plt.show()

In [None]:
# Preview the numeric Month / Subjectivity / Polarity frame used for the plots.
df5.head()

In [None]:
def getAnalysis(points):
    """Map a polarity score to a sentiment label.

    Args:
        points: Numeric polarity score.

    Returns:
        "Negative" if the score is below zero, "Neutral" if it equals zero,
        and "Positive" otherwise.
    """
    if points < 0:
        return "Negative"
    if points == 0:
        return "Neutral"
    return "Positive"

In [None]:
# Label every remaining post by its polarity and tally the three classes.
labels = [getAnalysis(score) for score in df4['Polarity']]
pos = labels.count('Positive')
neg = labels.count('Negative')
neu = len(labels) - pos - neg  # everything that is neither positive nor negative

print("Positive comments", pos)
print("Negative comments", neg)
print("Neutral comments", neu)

In [None]:
# Store the sentiment label of each post in a new 'Analysis' column.
df4['Analysis'] = df4['Polarity'].map(getAnalysis)
df4.head(10)

In [None]:
# Categorical plot of subjectivity for each polarity value.
# NOTE(review): Polarity is a continuous score, so catplot will treat every
# distinct value as its own category; x='Analysis' may have been intended — confirm.
sns.catplot(x='Polarity',y='Subjectivity',data=df4)

## Finding the counts of Positive, Negative and Neutral comments

In [None]:
# Bar chart of how many posts fall into each label of the 'Analysis' column.
sns.countplot(x='Analysis',data=df4)

## Finding customer engagements with reddit comments

In [None]:
# Back to the full dataframe, which still holds the 'score' and 'comms_num'
# columns used in the engagement analysis below.
df.head()

## Finding post with most replies
The column 'comms_num' indicates the number of replies on each post in the subreddit r/VaccineMyths. <br>
The column 'score' determines the number of upvotes to a post

In [None]:
# Order the posts by reply count, highest first, and show the ten most-replied posts.
# Note: inplace=True reorders `df` itself, so every later cell sees the sorted frame.
df.sort_values(by=['comms_num'],ascending=False, inplace=True)
df.head(10)

In [None]:
# Grid of pairwise plots across the columns of df, to eyeball how the
# variables relate to one another.
sns.pairplot(df)

## Finding how upvotes affect replies


In [None]:
# Line plot of reply count ('comms_num') against upvotes ('score').
ax=sns.lineplot(x='score', y='comms_num',data=df)
# Replace the raw column names with reader-friendly axis labels and add a title.
ax.set(xlabel="Upvotes", ylabel = "Replies",title='Representation of counts- Upvotes vs Replies')