# Introduction



We capture here discussions from the **r/Politics** subreddit.

The dataset is updated daily, but because it is not a very dynamic subreddit, you will probably not see dramatic changes on a daily basis.



# Analysis preparation

We initialize the packages that we will use in the analysis.

In [None]:
import numpy as np 
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
from wordcloud import WordCloud, STOPWORDS
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import warnings
warnings.simplefilter("ignore")

We read and glimpse the data.

In [None]:
# Load the Reddit politics CSV from the Kaggle input directory into a DataFrame.
data_df = pd.read_csv("/kaggle/input/politics-on-reddit/reddit_politics.csv")

In [None]:
# Preview the first rows of the loaded data.
data_df.head()

We also look at things like data quality, for example missing data.

In [None]:
# Print the column names, dtypes and non-null counts.
data_df.info()

In [None]:
def missing_data(data):
    """Summarize missing values for every column of a DataFrame.

    Args:
        data: DataFrame to inspect.

    Returns:
        A DataFrame with one column per input column and three rows:
        'Total' (number of nulls), 'Percent' (share of nulls, scaled to 100)
        and 'Types' (the column dtype rendered as a string).
    """
    null_mask = data.isnull()
    missing_count = null_mask.sum()
    missing_pct = missing_count / null_mask.count() * 100
    summary = pd.concat(
        [missing_count, missing_pct], axis=1, keys=['Total', 'Percent']
    )
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    # Transpose so the original columns run across the top of the report.
    return summary.T

In [None]:
# Report the null count, null percentage and dtype of each column.
missing_data(data_df)

The body of posts is missing in approximately half of the data.

# Data visualization


We will use wordclouds to identify the most frequent words in the titles and body of the posts.

We will remove frequently used words, like "vaccine" and "vaccination", some bad words, as well as "Comment", which is a keyword we used for the title of comments (vs. messages, which also have a title).

In [None]:
def show_wordcloud(data, title=""):
    """Render a word cloud built from a collection of text entries.

    Args:
        data: pandas Series (or any object exposing ``dropna()``) holding the
            texts. Missing entries are dropped and the rest are joined with
            single spaces.
        title: Figure title shown above the word cloud.

    Returns:
        None. The figure is displayed with ``plt.show()``; when there is no
        text to draw, a short notice is printed instead and nothing is plotted.
    """
    # str() keeps the join from failing on non-string cells (e.g. numbers).
    text = " ".join(str(t) for t in data.dropna())
    if not text.strip():
        # WordCloud.generate() raises ValueError when it is given no words,
        # which happens for an empty selection (e.g. a sentiment class that
        # matched no posts).
        print(f"No text available for word cloud: {title}")
        return

    stopwords = set(STOPWORDS)
    stopwords.update(["t", "co", "https", "amp", "U", "fuck", "fucking", "Comment", "vaccination", "vaccine", "vaccines", "vaccinate", "vaccinated"])
    wordcloud = WordCloud(stopwords=stopwords, scale=4, max_font_size=50, max_words=500,background_color="black").generate(text)
    fig = plt.figure(1, figsize=(16,16))
    plt.axis('off')
    fig.suptitle(title, fontsize=20)
    fig.subplots_adjust(top=2.3)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()

## Title

In [None]:
# Word cloud over all post titles.
show_wordcloud(data_df['title'], title = 'Prevalent words in titles')

## Body

In [None]:
# Word cloud over all post bodies; show_wordcloud drops the missing ones.
show_wordcloud(data_df['body'], title = 'Prevalent words in post bodies')

# Sentiment analysis

## With nltk SentimentIntensityAnalyzer

In [None]:
# borrowed from https://www.kaggle.com/pashupatigupta/sentiments-transformer-vader-embedding-bert
sia = SentimentIntensityAnalyzer()

def find_sentiment(post):
    """Label a text as Positive, Negative or Neutral from its compound score.

    Args:
        post: Text to score with the module-level ``sia`` analyzer.

    Returns:
        "Positive" when the compound score is above 0, "Negative" when it is
        below 0, and "Neutral" otherwise.
    """
    # Score the text once and reuse the value for both comparisons.
    compound = sia.polarity_scores(post)["compound"]
    if compound > 0:
        return "Positive"
    if compound < 0:
        return "Negative"
    return "Neutral"

In [None]:
def plot_sentiment(df, feature, title):
    """Draw bar charts of sentiment label counts and percentages.

    Args:
        df: DataFrame containing the sentiment label column.
        feature: Name of the column holding the sentiment labels.
        title: Human-readable name used in the axis labels and figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    counts = df[feature].value_counts()
    # Scale the shares to 0-100 so the values match the "Percentage" label.
    percent = counts / counts.sum() * 100

    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

    counts.plot(kind='bar', ax=ax1, color='green')
    percent.plot(kind='bar', ax=ax2, color='blue')
    ax1.set_ylabel(f'Counts : {title} sentiments', size=12)
    ax2.set_ylabel(f'Percentage : {title} sentiments', size=12)
    plt.suptitle(f"Sentiment analysis: {title}")
    plt.tight_layout()
    plt.show()

### Title

In [None]:
# Classify every title, then chart how the labels are distributed.
data_df['title_sentiment'] = data_df['title'].apply(find_sentiment)
plot_sentiment(data_df, 'title_sentiment', 'Title')

In [None]:
# Word cloud of the titles labelled 'Positive' in the title_sentiment column.
show_wordcloud(data_df.loc[data_df['title_sentiment']=='Positive', 'title'], title = 'Prevalent words in titles (Positive sentiment)')

In [None]:
# Word cloud of the titles labelled 'Negative' in the title_sentiment column.
show_wordcloud(data_df.loc[data_df['title_sentiment']=='Negative', 'title'], title = 'Prevalent words in titles (Negative sentiment)')

In [None]:
# Word cloud of the titles labelled 'Neutral' in the title_sentiment column.
show_wordcloud(data_df.loc[data_df['title_sentiment']=='Neutral', 'title'], title = 'Prevalent words in titles (Neutral sentiment)')

### Body

In [None]:
# Keep only the rows that have a body. The explicit copy makes `df` an
# independent frame, so adding columns to it does not write into a slice
# of `data_df`.
df = data_df.loc[data_df['body'].notna()].copy()
df['body_sentiment'] = df['body'].apply(find_sentiment)
plot_sentiment(df, 'body_sentiment', 'Body')

In [None]:
# Word cloud of the bodies labelled 'Positive' in the body_sentiment column.
show_wordcloud(df.loc[df['body_sentiment']=='Positive', 'body'], title = 'Prevalent words in body (Positive sentiment)')

In [None]:
# Word cloud of the bodies labelled 'Negative' in the body_sentiment column.
show_wordcloud(df.loc[df['body_sentiment']=='Negative', 'body'], title = 'Prevalent words in body (Negative sentiment)')

In [None]:
# Word cloud of the bodies labelled 'Neutral' in the body_sentiment column.
show_wordcloud(df.loc[df['body_sentiment']=='Neutral', 'body'], title = 'Prevalent words in body (Neutral sentiment)')

## With TextBlob

In [None]:
def find_sentiment_polarity_textblob(post):
    """Return the summed TextBlob polarity over the sentences of a text.

    Args:
        post: Text to analyze.

    Returns:
        The sum of the per-sentence polarity values, or 0 when TextBlob
        yields no sentences.
    """
    total = 0
    sentences = TextBlob(post).sentences
    for item in sentences:
        # Plain left-to-right addition, one term per sentence.
        total = total + item.sentiment.polarity
    return total

def find_sentiment_subjectivity_textblob(post):
    """Return the summed TextBlob subjectivity over the sentences of a text.

    Args:
        post: Text to analyze.

    Returns:
        The sum of the per-sentence subjectivity values, or 0 when TextBlob
        yields no sentences.
    """
    total = 0
    sentences = TextBlob(post).sentences
    for item in sentences:
        # Plain left-to-right addition, one term per sentence.
        total = total + item.sentiment.subjectivity
    return total

In [None]:
# TextBlob scores for titles: summed per-sentence polarity and subjectivity.
data_df['title_sentiment_polarity'] = data_df['title'].apply(find_sentiment_polarity_textblob)
data_df['title_sentiment_subjectivity'] = data_df['title'].apply(find_sentiment_subjectivity_textblob)

In [None]:
def plot_sentiment_textblob(df, feature, title):
    """Plot density estimates of TextBlob polarity and subjectivity side by side.

    Args:
        df: DataFrame holding the ``<feature>_sentiment_polarity`` and
            ``<feature>_sentiment_subjectivity`` columns.
        feature: Column-name prefix, e.g. "title" or "body".
        title: Human-readable name used in the axis labels and figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    polarity = df[feature+'_sentiment_polarity']
    subjectivity = df[feature+'_sentiment_subjectivity']

    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

    polarity.plot(kind='kde', ax=ax1, color='magenta')
    subjectivity.plot(kind='kde', ax=ax2, color='green')
    # In a KDE plot the scored quantity runs along the x-axis (the y-axis is
    # the estimated density), so the descriptive labels belong on x.
    ax1.set_xlabel(f'Sentiment polarity : {title}', size=12)
    ax2.set_xlabel(f'Sentiment subjectivity: {title}', size=12)
    plt.suptitle(f"Sentiment analysis (polarity & subjectivity): {title}")
    plt.tight_layout()
    plt.show()

In [None]:
# Density plots of the TextBlob polarity and subjectivity scores for titles.
plot_sentiment_textblob(data_df, "title", 'Title')

In [None]:
# TextBlob scores for bodies: summed per-sentence polarity and subjectivity.
df['body_sentiment_polarity'] = df['body'].apply(find_sentiment_polarity_textblob)
df['body_sentiment_subjectivity'] = df['body'].apply(find_sentiment_subjectivity_textblob)

In [None]:
# Density plots of the TextBlob polarity and subjectivity scores for bodies.
plot_sentiment_textblob(df, "body", 'Body')