# Reddit Vaccine Myths

### Sentiment Analysis with VADER

In [None]:
pip install nltk

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment import vader
import re
import nltk
import seaborn as sns
nltk.download('vader_lexicon')

In [None]:
# Load the Reddit vaccine-myths CSV from a path relative to the working
# directory, then preview the first rows.
dados = pd.read_csv('../input/reddit-vaccine-myths/reddit_vm.csv')
dados.head()

### Cleaning Data

In [None]:
# Remove the title, id, url and created columns in place.
unused_columns = ['title', 'id', 'url', 'created']
dados.drop(columns=unused_columns, inplace=True)

In [None]:
# Parse the raw 'timestamp' strings once and derive every calendar field from
# the parsed values, instead of splitting the text into lists and stripping
# the list brackets and quotes by hand.
# Rows whose timestamp is missing become NaT/NaN here (the manual string
# handling raised on them) and are removed by the dropna() step that follows.
parsed_timestamp = pd.to_datetime(dados['timestamp'], format="%Y-%m-%d %H:%M:%S")

# 'date' keeps only the calendar day (time set to midnight) as datetime64.
dados['date'] = parsed_timestamp.dt.normalize()
# 'hour', 'year' and 'month' are zero-padded strings, used later as group keys.
dados['hour'] = parsed_timestamp.dt.strftime("%H")
dados['year'] = parsed_timestamp.dt.strftime("%Y")
dados['month'] = parsed_timestamp.dt.strftime("%m")

# The raw column is no longer needed once its parts have been extracted.
dados.drop('timestamp', axis=1, inplace=True)

In [None]:
# Drop, in place, every row that has a missing value in any column.
dados.dropna(inplace=True)

In [None]:
# Normalise the post text before scoring it.
# URLs are removed first, while "://", "/" and "." are still present to delimit
# them. Stripping punctuation first glued each link into a single junk token
# (e.g. "httpswwwexamplecom") from which only the "https" prefix was removed.
url_pattern = re.compile(r'https?://\S+')
punctuation_pattern = re.compile(r'[\@\:\_\.\;\#\"\'\$\%\&\/]')


def _clean_body(text):
    """Return *text* with URLs and the selected punctuation characters removed."""
    cleaned = url_pattern.sub("", str(text))
    cleaned = punctuation_pattern.sub("", cleaned)
    # Kept from the original cleaning: drop any remaining literal "https".
    return cleaned.replace("https", "")


dados['body'] = dados['body'].str.lower().apply(_clean_body)

In [None]:
# Preview the first rows after cleaning.
dados.head()

### Sentiment Analysis

In [None]:
# Create the VADER analyzer used to score the post bodies.
sentiment = vader.SentimentIntensityAnalyzer()

In [None]:
# Score every post body; each entry is the dict returned by polarity_scores.
sentiments = list(map(sentiment.polarity_scores, dados['body'].values))

In [None]:
# Spread each component of the polarity-score dicts into its own column.
# Insertion order of this mapping fixes the order of the new columns.
score_columns = {
    'Negative Score': 'neg',
    'Positivo Score': 'pos',
    'Neuter Score': 'neu',
    'Compound Score': 'compound',
}
for column_name, score_key in score_columns.items():
    dados[column_name] = [result[score_key] for result in sentiments]

In [None]:
# Preview the frame with the four score columns added.
dados.head()

In [None]:
# Pull the compound scores out as an array for labelling.
score = dados['Compound Score'].values

In [None]:
# Turn each compound score into a categorical label using the +/-0.05 cut-offs.
def _label_from_compound(compound):
    """Return 'Positive', 'Negative' or "Neuter" for one compound score."""
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return "Neuter"


t = [_label_from_compound(v) for v in score]

dados['Sentimen'] = t

In [None]:
# Preview the frame with the 'Sentimen' label column added.
dados.head()

In [None]:
# Bar chart of how many rows carry each sentiment label.
dados['Sentimen'].value_counts().plot(kind='bar')

In [None]:
# Histogram of the compound score, using 11 bins.
sns.histplot(dados['Compound Score'], bins=11)

In [None]:
# One indicator column per sentiment label, appended to the right of the frame.
sentiment_dummies = pd.get_dummies(dados['Sentimen'])
dados = pd.concat([dados, sentiment_dummies], axis=1)
dados.head()

In [None]:
# Average the numeric columns per year.
# numeric_only=True is needed because the frame also holds text columns
# (e.g. 'body', 'Sentimen'); pandas 2.0+ raises a TypeError when asked to
# average those instead of silently leaving them out.
dados2 = dados.groupby('year').mean(numeric_only=True)
dados2

In [None]:
# Line chart of the mean negative / neutral / positive share per year.
# plt.subplots returns (figure, axes) in that order; the names were swapped
# before, so the object called "fig" was really the Axes.
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(dados2.index, dados2['Negative Score'], label='Negative')
ax.plot(dados2.index, dados2['Neuter Score'], label='Neuter')
ax.plot(dados2.index, dados2['Positivo Score'], label='Positive')
# Hide the right and top borders of the plot area.
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_title('Mean Composition by Year', fontsize=12, color='grey', style='oblique', fontweight='bold')
# Called with styling keywords only, xticks/yticks restyle the existing tick labels.
plt.xticks(color='grey', style='oblique', fontweight='bold')
plt.yticks(color='grey', style='oblique', fontweight='bold')
ax.set_xlabel("Year", fontsize=15, color='grey', style='oblique', fontweight='bold')
ax.set_ylabel("Composition", fontsize=15, color='grey', style='oblique', fontweight='bold')
ax.legend(fontsize=12)
plt.show()

In [None]:
# Average the numeric columns per hour of day.
# numeric_only=True is needed because the frame also holds text columns
# (e.g. 'body', 'Sentimen'); pandas 2.0+ raises a TypeError when asked to
# average those instead of silently leaving them out.
dados3 = dados.groupby('hour').mean(numeric_only=True)
dados3.head()

In [None]:
# Line chart of the mean negative / neutral / positive share per hour of day.
# plt.subplots returns (figure, axes) in that order; the names were swapped
# before, so the object called "fig" was really the Axes.
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(dados3.index, dados3['Negative Score'], label='Negative')
ax.plot(dados3.index, dados3['Neuter Score'], label='Neuter')
ax.plot(dados3.index, dados3['Positivo Score'], label='Positive')
# Hide the right and top borders of the plot area.
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_title('Mean Composition by Hour', fontsize=12, color='grey', style='oblique', fontweight='bold')
# Called with styling keywords only, xticks/yticks restyle the existing tick labels.
plt.xticks(color='grey', style='oblique', fontweight='bold')
plt.yticks(color='grey', style='oblique', fontweight='bold')
# The x axis holds the hour groups; the label previously said "Year" (copy-paste slip).
ax.set_xlabel("Hour", fontsize=15, color='grey', style='oblique', fontweight='bold')
ax.set_ylabel("Composition", fontsize=15, color='grey', style='oblique', fontweight='bold')
ax.legend(fontsize=12)
plt.show()

In [None]:
# Average the numeric columns per calendar month.
# numeric_only=True is needed because the frame also holds text columns
# (e.g. 'body', 'Sentimen'); pandas 2.0+ raises a TypeError when asked to
# average those instead of silently leaving them out.
dados4 = dados.groupby('month').mean(numeric_only=True)
dados4.head()

In [None]:
# Line chart of the mean negative / neutral / positive share per calendar month.
# plt.subplots returns (figure, axes) in that order; the names were swapped
# before, so the object called "fig" was really the Axes.
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(dados4.index, dados4['Negative Score'], label='Negative')
ax.plot(dados4.index, dados4['Neuter Score'], label='Neuter')
ax.plot(dados4.index, dados4['Positivo Score'], label='Positive')
# Hide the right and top borders of the plot area.
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_title('Mean Composition by Month', fontsize=12, color='grey', style='oblique', fontweight='bold')
# Called with styling keywords only, xticks/yticks restyle the existing tick labels.
plt.xticks(color='grey', style='oblique', fontweight='bold')
plt.yticks(color='grey', style='oblique', fontweight='bold')
# The x axis holds the month groups; the label previously said "Year" (copy-paste slip).
ax.set_xlabel("Month", fontsize=15, color='grey', style='oblique', fontweight='bold')
ax.set_ylabel("Composition", fontsize=15, color='grey', style='oblique', fontweight='bold')
ax.legend(fontsize=12)
plt.show()