In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
import re
import os
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Let's see the data

In [None]:
# Show every file that Kaggle mounted below the input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Read the Reddit vaccine-myths posts and preview the first five rows.
df = pd.read_csv('/kaggle/input/reddit-vaccine-myths/reddit_vm.csv')
df.head(5)

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

There are many null values in the `url` and `body` columns.

Summary statistics of the numeric columns

In [None]:
# Descriptive statistics (count, mean, spread, quartiles) of the frame's columns.
df.describe()

Extracting year and month from timestamp

In [None]:
# Parse the timestamp column once, then read the calendar year and month from it.
parsed_timestamp = pd.to_datetime(df['timestamp'])
df['year'] = parsed_timestamp.dt.year
df['month'] = parsed_timestamp.dt.month

In [None]:
# Preview the first rows, which now include the derived year and month columns.
df.head()

Use the Pearson correlation to see which columns are most strongly correlated; we can then decide which of them to drop.

In [None]:
# Pearson correlation is only defined for numeric data, so restrict the frame to
# its numeric columns first. On pandas >= 2.0, calling corr() on a frame that
# still holds text columns raises instead of silently skipping them.
pearsoncorrelation = df.select_dtypes(include = 'number').corr(method = 'pearson')

# Annotated heatmap of the correlation matrix; the diverging palette makes
# positive and negative correlations easy to tell apart.
sns.heatmap(pearsoncorrelation,
           xticklabels = pearsoncorrelation.columns,
           yticklabels = pearsoncorrelation.columns,
           cmap = 'RdBu_r',
           annot = True,
           linewidth = 0.5)

Some visualizations 

In [None]:
# Visual map of missing cells: one row per post, one column per field.
missing_mask = df.isna()
sns.heatmap(missing_mask)

In [None]:
# Density of the number of comments per post.
# `fill` is the current name of the deprecated `shade` argument.
sns.kdeplot(df['comms_num'], fill = True , color = 'Blue')

In [None]:
# Number of posts per year.
sns.countplot(x='year', data=df)

In [None]:
# Number of posts per calendar month (all years pooled).
sns.countplot(x='month', data=df)

In [None]:
# Select the poster style BEFORE drawing: a style sheet only affects figures
# created after it is applied. The bundled seaborn styles were renamed to
# "seaborn-v0_8-*" in newer matplotlib releases, so pick whichever name this
# installation provides and skip styling if neither exists.
poster_style = next(
    (name for name in ('seaborn-v0_8-poster', 'seaborn-poster')
     if name in plt.style.available),
    None,
)
if poster_style is not None:
    plt.style.use(poster_style)

# Posts per month, split by year.
sns.countplot(data = df, x = 'month' , hue = 'year')

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal).
sns.pairplot(data=df)

Dropping the columns not required

In [None]:
# Columns that play no part in the text analysis below.
unused_columns = ['comms_num', 'id', 'url', 'created', 'timestamp']
df.drop(unused_columns, axis='columns', inplace=True)
df.head()

Convert the titles and bodies to lower case

In [None]:
def _normalise_text(column):
    """Return `column` as lower-cased strings with whitespace runs collapsed.

    Missing values become empty strings. Casting NaN straight to ``str``
    would instead inject the literal word "nan" into the text.
    """
    return (
        column
        .fillna('')
        .astype(str)
        .str.lower()
        .str.split()      # split on any whitespace ...
        .str.join(' ')    # ... and re-join with single spaces
    )

df['title'] = _normalise_text(df['title'])
df['body'] = _normalise_text(df['body'])

Remove @-mention patterns, if any

In [None]:
def remove_pattern(input_txt , pattern):
    """Replace every match of the regex `pattern` in `input_txt` with one space.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the fragments to remove.

    Returns
    -------
    str
        `input_txt` with each match replaced by a single space.
    """
    # Substitute with the pattern itself. Feeding each matched fragment back
    # into re.sub() as a new regex breaks as soon as a match contains a
    # metacharacter such as "+", "(" or ".".
    return re.sub(pattern, " ", input_txt)
# Preview the frame; defining remove_pattern does not modify df.
df.head()

In [None]:
# Match a whole @mention: "@" followed by every word character after it.
# The previous pattern consumed only one character after the "@", leaving the
# rest of the username in the text. The raw string also avoids the invalid
# "\w" escape in a normal string literal.
MENTION_PATTERN = r'@\w+'

# Series.apply keeps the pandas index and also works on an empty column.
df['clean_title'] = df['title'].apply(lambda text: remove_pattern(text, MENTION_PATTERN))
df['clean_body'] = df['body'].apply(lambda text: remove_pattern(text, MENTION_PATTERN))

In [None]:
# Disabled step: strip every character that is not a lower-case letter or "#".
# NOTE(review): the class "a-za-z" repeats "a-z" (possibly meant "a-zA-Z"), and
# replacing with an empty string would also delete the spaces between words —
# confirm the intent before re-enabling.
#df['clean_title'] = df['clean_title'].str.replace("[^a-za-z#]","")
#df['clean_body'] = df['clean_body'].str.replace("[^a-za-z#]","")

In [None]:
# Preview the frame with the new clean_title / clean_body columns.
df.head()

In [None]:
def _drop_short_words(text):
    """Keep only the whitespace-separated words longer than three characters."""
    long_words = filter(lambda token: len(token) > 3, text.split())
    return " ".join(long_words)

df['clean_title'] = df['clean_title'].map(_drop_short_words)
df['clean_body'] = df['clean_body'].map(_drop_short_words)



In [None]:
# Preview the cleaned columns after words of three characters or fewer were removed.
df.head()

Tokenize the cleaned columns; from here on we work only with them

In [None]:
# Split each cleaned text on whitespace into a list of tokens.
tokenized_title = df['clean_title'].str.split()
tokenized_body = df['clean_body'].str.split()

Stem the words so that different forms of the same word share a common stem

In [None]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def _stem_tokens(tokens):
    """Return the Porter stem of every token, in the original order."""
    return list(map(stemmer.stem, tokens))

tokenized_title = tokenized_title.map(_stem_tokens)
tokenized_body = tokenized_body.map(_stem_tokens)

In [None]:
# Re-assemble each stemmed token list into one space-separated string.
# Series.apply builds a new Series, so the result does not depend on the index
# being 0..n-1. tokenized_title itself stays a Series of token lists, which
# makes re-running this cell safe: joining an already-joined string would
# split it into single characters.
df['clean_title'] = tokenized_title.apply(" ".join)

In [None]:
# Re-assemble each stemmed token list into one space-separated string.
# Series.apply builds a new Series, so the result does not depend on the index
# being 0..n-1. tokenized_body itself stays a Series of token lists, which
# makes re-running this cell safe: joining an already-joined string would
# split it into single characters.
df['clean_body'] = tokenized_body.apply(" ".join)

Drop the original `title` and `body` columns, which the cleaned columns now duplicate

In [None]:
# Only the cleaned text columns are used from here on.
df.drop(['title', 'body'], axis='columns', inplace=True)


Word clouds of the cleaned titles and bodies (TODO: these word clouds still need improvement)

In [None]:
from wordcloud import WordCloud

# One long string holding every cleaned title, separated by spaces.
all_words = " ".join(df['clean_title'])

cloud_builder = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
)
wordcloud = cloud_builder.generate(all_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud

# One long string holding every cleaned body, separated by spaces.
all_words = " ".join(df['clean_body'])

cloud_builder = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
)
wordcloud = cloud_builder.generate(all_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Compare the words used in higher-scoring (score >= 5) and lower-scoring (score < 5) post bodies to see which are the most impactful

In [None]:
# Bodies of posts with a score of at least 5.
# Join them with a space: an empty separator would fuse the last word of one
# body with the first word of the next and distort the word frequencies.
all_words = " ".join(df.loc[df['score'] >= 5, 'clean_body'])

wordcloud = WordCloud(width = 800 , height = 500 , random_state = 42 , max_font_size = 100).generate(all_words)

# Render the cloud as an image without axes.
plt.figure(figsize = (15,8))
plt.imshow(wordcloud , interpolation = 'bilinear')
plt.axis('off')
plt.show()

In [None]:
# Bodies of posts with a score below 5.
# Join them with a space: an empty separator would fuse the last word of one
# body with the first word of the next and distort the word frequencies.
all_words = " ".join(df.loc[df['score'] < 5, 'clean_body'])

wordcloud = WordCloud(width = 800 , height = 500 , random_state = 42 , max_font_size = 100).generate(all_words)

# Render the cloud as an image without axes.
plt.figure(figsize = (15,8))
plt.imshow(wordcloud , interpolation = 'bilinear')
plt.axis('off')
plt.show()