In [3]:
import os, pathlib, sys
from fnmatch import fnmatch
import re
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json

# LIWC vs. Vader.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import liwc
import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\taumuell\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

---
# LIWC - Linguistic Inquiry and Word Count 
---

In [None]:
# Load the LIWC 2007 English dictionary from the project's data directory.
# NOTE(review): per the liwc package README, `parse` maps a token to its matching
# LIWC categories and `category_names` lists every category - confirm against the
# installed version.
parse, category_names = liwc.load_token_parser('../data/LIWC2007_English100131.dic')

---
# Vader Sentiment
---

In [9]:
# Create the VADER sentiment analyzer. The scoring functions in this notebook
# read this module-level `analyzer` instead of receiving it as an argument.
analyzer = SentimentIntensityAnalyzer()
# Example usage: analyzer.polarity_scores(text) returns a dict of scores such as
# {'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.8393}

# Function to create the sentiment dataframe from Submissions.

# -------------------------------------------------
# Calc Sentiment Scores for each Submission.
# Submissions
# -------------------------------------------------
def create_sentiment_submissions(df, user_base, save_path):
    """Score every submission with VADER and save the result as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Submissions; the ``subreddit``, ``author`` and ``selftext`` columns are read.
    user_base : mapping or pandas.Series
        Looked up as ``user_base[subreddit]`` to fill the ``unique_authors`` column.
    save_path : str or path-like
        Destination handed to ``DataFrame.to_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns ``subreddit``, ``unique_authors``,
        ``author``, ``selftext``, ``negative``, ``neutral``, ``positive``, ``compound``.

    Notes
    -----
    Relies on the module-level ``analyzer`` for scoring. The CSV is written once,
    after all rows are scored (also for empty input), so no partial file is left
    behind if scoring fails midway.
    """
    columns = ["subreddit", "unique_authors", "author", "selftext", "negative", "neutral", "positive", "compound"]
    records = []
    for subreddit, author, selftext in zip(df["subreddit"], df["author"], df["selftext"]):
        score = analyzer.polarity_scores(selftext)
        records.append({
            "subreddit": subreddit,
            "unique_authors": user_base[subreddit],
            "author": author,
            "selftext": selftext,
            "negative": score['neg'],
            "neutral": score['neu'],
            "positive": score['pos'],
            "compound": score['compound'],
        })
    # Build the frame in one step: concatenating row by row is quadratic in the
    # number of submissions, and so was rewriting the CSV after every row.
    sentiment_df = pd.DataFrame(records, columns=columns)
    sentiment_df.to_csv(save_path)
    return sentiment_df

# -------------------------------------------------
# Calc Avg. Sentiment Scores for each Community.
# Submissions
# -------------------------------------------------
def create_avg_sentiment_submissions(df, save_path):
    """Average the per-submission sentiment scores per subreddit and save them as JSON.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-submission scores with the columns ``subreddit``, ``negative``,
        ``neutral``, ``positive``, ``compound`` and a community-size column named
        either ``community_size`` or ``unique_authors``.
    save_path : str or path-like
        Destination handed to ``DataFrame.to_json``.

    Returns
    -------
    pandas.DataFrame
        One row per subreddit (in order of first appearance) with the columns
        ``subreddit``, ``unique_authors``, ``avg_negative``, ``avg_neutral``,
        ``avg_positive``, ``avg_compound`` and ``sentiment``.

    Notes
    -----
    ``sentiment`` is "Positive" for an average compound >= 0.05, "Negative" for
    <= -0.05, "Neutral" in between, and "" when the average is NaN.
    Rows whose ``subreddit`` is missing are skipped by the group-by.
    """
    columns = ["subreddit", "unique_authors", "avg_negative", "avg_neutral", "avg_positive", "avg_compound", "sentiment"]
    # The two per-submission builders in this notebook name the size column
    # differently (community_size vs. unique_authors); accept either one.
    size_column = "community_size" if "community_size" in df.columns else "unique_authors"

    records = []
    # One pass over the groups instead of re-filtering the frame for every statistic.
    for subreddit, group in df.groupby("subreddit", sort=False):
        sentiment_score = group["compound"].mean()
        if sentiment_score >= 0.05:
            sentiment = "Positive"
        elif sentiment_score <= -0.05:
            sentiment = "Negative"
        elif pd.notna(sentiment_score):
            sentiment = "Neutral"
        else:
            # No usable compound scores for this community.
            sentiment = ""
        records.append({
            "subreddit": subreddit,
            "unique_authors": group[size_column].median(),
            "avg_negative": group["negative"].mean(),
            "avg_neutral": group["neutral"].mean(),
            "avg_positive": group["positive"].mean(),
            "avg_compound": sentiment_score,
            "sentiment": sentiment,
        })

    # Build and write once; the JSON used to be rewritten after every subreddit.
    avg_sentiment_df = pd.DataFrame(records, columns=columns)
    avg_sentiment_df.to_json(save_path)
    return avg_sentiment_df

In [None]:
# Load the preprocessed submissions; the first CSV column becomes the index.
df = pd.read_csv("../../submissions_preprocessed.csv", index_col=0)


In [None]:
# Show only the submissions posted in one subreddit (rendered as the cell output).
df[df['subreddit'] == 'AntiPornVideos']

In [None]:
# Number of distinct authors per subreddit; create_sentiment() reads this
# module-level Series to fill its community_size column.
user_base = df.groupby(by="subreddit")['author'].nunique()
# Spot-check the count for a single community.
user_base['AdultSelfHarm']

In [None]:
# Create an instance of the VADER sentiment analyzer.
# NOTE(review): this repeats the instantiation from the earlier "Vader Sentiment" cell.
analyzer = SentimentIntensityAnalyzer()
# Example usage: analyzer.polarity_scores(text) returns a dict of scores such as
# {'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.8393}

# Function to create the sentiment dataframe.
def create_sentiment(df):
    """Score every submission with VADER and return the results as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Submissions; the ``subreddit``, ``author`` and ``selftext`` columns are read.

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns ``subreddit``, ``community_size``,
        ``author``, ``selftext``, ``negative``, ``neutral``, ``positive``, ``compound``.

    Notes
    -----
    Relies on two module-level names: ``analyzer`` for scoring and ``user_base``
    (looked up as ``user_base[subreddit]``) for the community size.
    """
    columns = ["subreddit", "community_size", "author", "selftext", "negative", "neutral", "positive", "compound"]
    records = []
    for subreddit, author, selftext in zip(df["subreddit"], df["author"], df["selftext"]):
        score = analyzer.polarity_scores(selftext)
        records.append({
            "subreddit": subreddit,
            "community_size": user_base[subreddit],
            "author": author,
            "selftext": selftext,
            "negative": score['neg'],
            "neutral": score['neu'],
            "positive": score['pos'],
            # Store the compound score itself; the negative score used to be copied here.
            "compound": score['compound'],
        })
    # Build the frame in one step; concatenating row by row is quadratic.
    return pd.DataFrame(records, columns=columns)
    


# Reload the preprocessed submissions (first CSV column as index) and score each one.
df = pd.read_csv("../../submissions_preprocessed.csv", index_col=0)

sentiment_df = create_sentiment(df)

In [None]:
# Persist the per-submission sentiment scores as JSON.
sentiment_df.to_json('../data/CommunityInfo.json')


In [None]:
# Read the exported JSON back and display it (presumably a round-trip check of the export).
test = pd.read_json('../data/CommunityInfo.json')
test

---
# Average Information per Community
---

In [6]:
# Load the per-submission sentiment scores from the JSON export.
# NOTE: this rebinds `df`, which earlier cells use for the raw submissions frame.
df = pd.read_json('../data/CommunityInfo.json')
df

Unnamed: 0,subreddit,community_size,author,selftext,negative,neutral,positive,compound
0,addiction,3954,jrizos,This person has been using for roughly 10 year...,0.058,0.792,0.150,0.9604
1,addiction,3954,FlashGameAddict,I'm addicted to online games. Most recently I'...,0.129,0.741,0.131,0.0233
2,addiction,3954,themarknessmonster,"I think I might be addicted to light, or, more...",0.047,0.917,0.036,-0.4767
3,addiction,3954,jmc726,"For anyone not familiar with Tramadol, it's a ...",0.104,0.825,0.071,-0.9555
4,addiction,3954,PetiePal,Hey guys. I've got a friend who recently came ...,0.187,0.743,0.070,-0.9978
...,...,...,...,...,...,...,...,...
344071,SelfHate,356,throw_away_the_panda,I'm sat at my desk again and I'm unable to kee...,0.225,0.724,0.050,-0.9858
344072,SelfHate,356,twisted-spirit,"The more I try to make friends, or pretend I'm...",0.117,0.718,0.164,0.8337
344073,SelfHate,356,DedKulak1917,Recently cheated on my fiancé. We are in the p...,0.158,0.695,0.147,-0.4198
344074,SelfHate,356,edensrotting,"I'm a self destructive person and honestly, i ...",0.242,0.535,0.223,-0.7457


In [10]:
# Aggregate the per-submission scores into one row per subreddit; the function
# also writes the result to `save_path` as JSON.
save_path = "../data/avg_community_sentiment.json"
avg_sentiment_df = create_avg_sentiment_submissions(df, save_path)

display(avg_sentiment_df)

Unnamed: 0,subreddit,unique_authors,avg_negative,avg_neutral,avg_positive,avg_compound,sentiment
0,addiction,3954.0,0.107550,0.767491,0.124959,0.057478,Positive
1,SMARTRecovery,87.0,0.070488,0.766888,0.162680,0.458270,Positive
2,AtheistTwelveSteppers,86.0,0.046739,0.825306,0.128000,0.478459,Positive
3,secularsobriety,31.0,0.068878,0.784829,0.146293,0.342949,Positive
4,recovery,601.0,0.094130,0.764974,0.140923,0.235740,Positive
...,...,...,...,...,...,...,...
83,MMFB,2849.0,0.140617,0.735199,0.124187,-0.158414,Negative
84,bulimia,1524.0,0.126585,0.759874,0.113079,-0.103977,Negative
85,BodyDysmorphia,1688.0,0.138453,0.732403,0.129150,-0.069987,Negative
86,BodyAcceptance,624.0,0.106830,0.747341,0.145825,0.272045,Positive


In [11]:
# Reload the saved per-community averages from disk.
# NOTE: this rebinds `df` again, replacing the per-submission scores frame.
df = pd.read_json(save_path)
df

Unnamed: 0,subreddit,unique_authors,avg_negative,avg_neutral,avg_positive,avg_compound,sentiment
0,addiction,3954,0.107550,0.767491,0.124959,0.057478,Positive
1,SMARTRecovery,87,0.070488,0.766888,0.162680,0.458270,Positive
2,AtheistTwelveSteppers,86,0.046739,0.825306,0.128000,0.478459,Positive
3,secularsobriety,31,0.068878,0.784829,0.146293,0.342949,Positive
4,recovery,601,0.094130,0.764974,0.140923,0.235740,Positive
...,...,...,...,...,...,...,...
83,MMFB,2849,0.140617,0.735199,0.124187,-0.158414,Negative
84,bulimia,1524,0.126585,0.759874,0.113079,-0.103977,Negative
85,BodyDysmorphia,1688,0.138453,0.732403,0.129150,-0.069987,Negative
86,BodyAcceptance,624,0.106830,0.747341,0.145825,0.272045,Positive
