In [None]:
import os, pathlib, sys
from fnmatch import fnmatch
import re
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json

# LIWC vs. Vader.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import liwc
import nltk
nltk.download('vader_lexicon')


---
# LIWC - Linguistic Inquiry and Word Count 
---

In [None]:
# Load the LIWC 2007 English dictionary from ../data. load_token_parser returns a
# pair, unpacked here as `parse` and `category_names`.
# NOTE(review): neither name is referenced by any other cell visible in this notebook.
parse, category_names = liwc.load_token_parser('../data/LIWC2007_English100131.dic')

---
# Vader Sentiment
---

In [None]:
# Create the module-level VADER sentiment analyzer. The create_sentiment* functions
# in this notebook read `analyzer` as a global, so this cell must run before they are called.
analyzer = SentimentIntensityAnalyzer()
# Example call and the shape of its result (a dict with four scores):
# analyzer.polarity_scores(text)
# {'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.8393}

# Function to create the sentiment dataframe from Submissions.

# -------------------------------------------------
# Calculate VADER sentiment scores for each submission.
# Submissions
# -------------------------------------------------
def create_sentiment_submissions(df, user_base, save_path):
    """Score every submission with the VADER analyzer and save the table as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per submission. The ``subreddit``, ``author`` and ``selftext``
        columns are read.
    user_base : mapping or pandas.Series
        Looked up as ``user_base[subreddit]``; the value is stored in the
        ``unique_authors`` column of the result.
    save_path : str or path-like
        Destination passed to ``DataFrame.to_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (index 0..n-1) with the columns ``subreddit``,
        ``unique_authors``, ``author``, ``selftext``, ``negative``,
        ``neutral``, ``positive`` and ``compound``.

    Notes
    -----
    Reads the module-level ``analyzer`` and calls its ``polarity_scores``
    method on each ``selftext`` value.

    The CSV is written exactly once, after all rows are scored. An empty
    input therefore produces a header-only file.
    """
    columns = ["subreddit", "unique_authors", "author", "selftext", "negative", "neutral", "positive", "compound"]

    # Collect plain dicts and build the frame once: concatenating inside the
    # loop copies the whole frame on every iteration (quadratic in row count).
    records = []
    for subreddit, author, selftext in zip(df["subreddit"], df["author"], df["selftext"]):
        score = analyzer.polarity_scores(selftext)
        records.append({
            "subreddit": subreddit,
            "unique_authors": user_base[subreddit],
            "author": author,
            "selftext": selftext,
            "negative": score['neg'],
            "neutral": score['neu'],
            "positive": score['pos'],
            "compound": score['compound'],
        })

    sentiment_df = pd.DataFrame(records, columns=columns)
    # Write the finished table a single time instead of rewriting the file per row.
    sentiment_df.to_csv(save_path)
    return sentiment_df

# -------------------------------------------------
# Calculate average sentiment scores for each community (subreddit).
# Submissions
# -------------------------------------------------
def create_avg_sentiment_submissions(df, save_path):
    """Average the per-submission sentiment scores for each subreddit and save as JSON.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-submission scores with the columns ``subreddit``, ``negative``,
        ``neutral``, ``positive`` and ``compound``, plus a community-size
        column named either ``community_size`` (as produced by
        ``create_sentiment``) or ``unique_authors`` (as produced by
        ``create_sentiment_submissions``).
    save_path : str or path-like
        Destination passed to ``DataFrame.to_json``.

    Returns
    -------
    pandas.DataFrame
        One row per subreddit, in order of first appearance in ``df``, with
        the columns ``subreddit``, ``unique_authors`` (median of the size
        column), ``avg_negative``, ``avg_neutral``, ``avg_positive``,
        ``avg_compound`` and ``sentiment``. ``sentiment`` is "Positive",
        "Negative" or "Neutral".

    Raises
    ------
    KeyError
        If ``df`` has neither a ``community_size`` nor a ``unique_authors`` column.

    Notes
    -----
    The JSON file is written exactly once, after all subreddits are
    aggregated. Rows whose ``subreddit`` is missing (NaN) are skipped, because
    ``groupby`` drops NaN keys by default.
    """
    # Cut-offs on the mean compound score: at or above the positive threshold
    # is "Positive", at or below the negative threshold is "Negative".
    positive_threshold = 0.05
    negative_threshold = -0.05

    columns = ["subreddit", "unique_authors", "avg_negative", "avg_neutral", "avg_positive", "avg_compound", "sentiment"]

    # The two per-submission builders in this notebook name the size column
    # differently; accept either so both outputs can be aggregated here.
    if "community_size" in df.columns:
        size_column = "community_size"
    elif "unique_authors" in df.columns:
        size_column = "unique_authors"
    else:
        raise KeyError("expected a 'community_size' or 'unique_authors' column")

    records = []
    # sort=False keeps the subreddits in order of first appearance, matching
    # the order of df.subreddit.unique().
    for subreddit, group in df.groupby("subreddit", sort=False):
        avg_compound = group["compound"].mean()
        if avg_compound >= positive_threshold:
            sentiment = "Positive"
        elif avg_compound <= negative_threshold:
            sentiment = "Negative"
        else:
            # Also reached when the mean is NaN (both comparisons are False).
            sentiment = "Neutral"

        records.append({
            "subreddit": subreddit,
            "unique_authors": group[size_column].median(),
            "avg_negative": group["negative"].mean(),
            "avg_neutral": group["neutral"].mean(),
            "avg_positive": group["positive"].mean(),
            "avg_compound": avg_compound,
            "sentiment": sentiment,
        })

    # Build the frame and write the file once rather than on every iteration.
    avg_sentiment_df = pd.DataFrame(records, columns=columns)
    avg_sentiment_df.to_json(save_path)
    return avg_sentiment_df

In [None]:
# Load the preprocessed submissions; the first CSV column becomes the index.
# Later cells read the `subreddit`, `author` and `selftext` columns of this frame.
df = pd.read_csv("../../submissions_preprocessed.csv", index_col=0)


In [None]:
# Spot-check: display the rows that belong to a single subreddit.
df[df['subreddit'] == 'AntiPornVideos']

In [None]:
# Number of distinct authors per subreddit, as a Series indexed by subreddit name.
# create_sentiment reads `user_base` as a global, so this cell must run before it is called.
user_base = df.groupby(by="subreddit")['author'].nunique()
# Spot-check the count for one community.
user_base['AdultSelfHarm']

In [None]:
# Create an instance of the Vader sentiment analyzer
# NOTE(review): this rebinds the `analyzer` global already created in an earlier
# cell with an identical call; one of the two looks redundant.
analyzer = SentimentIntensityAnalyzer()
# analyzer.polarity_scores(text)
# {'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.8393}

# Function to create the sentiment dataframe.
def create_sentiment(df):
    """Score every submission with the VADER analyzer.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per submission. The ``subreddit``, ``author`` and ``selftext``
        columns are read.

    Returns
    -------
    pandas.DataFrame
        One row per input row (index 0..n-1) with the columns ``subreddit``,
        ``community_size``, ``author``, ``selftext``, ``negative``,
        ``neutral``, ``positive`` and ``compound``.

    Notes
    -----
    Reads two module-level globals: ``analyzer`` (its ``polarity_scores``
    method is called on each ``selftext``) and ``user_base`` (looked up as
    ``user_base[subreddit]`` to fill ``community_size``).
    """
    columns = ["subreddit", "community_size", "author", "selftext", "negative", "neutral", "positive", "compound"]

    # Collect plain dicts and build the frame once: concatenating inside the
    # loop copies the whole frame on every iteration (quadratic in row count).
    records = []
    for subreddit, author, selftext in zip(df["subreddit"], df["author"], df["selftext"]):
        score = analyzer.polarity_scores(selftext)
        records.append({
            "subreddit": subreddit,
            "community_size": user_base[subreddit],
            "author": author,
            "selftext": selftext,
            "negative": score['neg'],
            "neutral": score['neu'],
            "positive": score['pos'],
            # Must be the 'compound' score. Storing 'neg' here would make the
            # column non-negative, so per-community averages could never be
            # classified as "Negative".
            "compound": score['compound'],
        })

    return pd.DataFrame(records, columns=columns)
    


# Re-read the same CSV that an earlier cell loaded, then score every submission.
# NOTE(review): create_sentiment reads the globals `analyzer` and `user_base`,
# so the cell that builds `user_base` must have been run before this line.
df = pd.read_csv("../../submissions_preprocessed.csv", index_col=0)

sentiment_df = create_sentiment(df)

In [None]:
# Persist the per-submission scores; later cells read this file back.
sentiment_df.to_json('../data/CommunityInfo.json')


In [None]:
# Round-trip check: read the saved file back and display it.
test = pd.read_json('../data/CommunityInfo.json')
test

---
# Average Information per Community
---

In [None]:
# Load the per-submission sentiment scores written by an earlier cell.
# NOTE: this rebinds `df`, which earlier cells used for the raw submissions.
df = pd.read_json('../data/CommunityInfo.json')
df

In [None]:
# Aggregate the per-submission scores into one row per subreddit; the function
# also writes the result to save_path as JSON.
save_path = "../data/avg_community_sentiment.json"
avg_sentiment_df = create_avg_sentiment_submissions(df, save_path)

display(avg_sentiment_df)

In [None]:
# Reload the saved per-community averages (rebinding `df` once more).
df = pd.read_json("../data/avg_community_sentiment.json")
# Lift the row limit so every community is rendered. This is a session-wide
# pandas option and also affects any cell run after this one.
pd.set_option('display.max_rows', None)
df