## Setup

In [1]:
import os
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import praw
from nltk.sentiment import SentimentIntensityAnalyzer
from praw.models import MoreComments
from scipy import stats
%config InlineBackend.figure_format = 'svg'

## Obtaining Reddit Posts

In [None]:
# Reddit API credentials are read from the environment so that no secret is
# stored in the notebook or its version history. Export REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel.
# SECURITY: credentials that were previously committed in this cell should be
# treated as leaked and revoked in the Reddit app settings.
_missing_credentials = [
    name
    for name in ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET")
    if not os.environ.get(name)
]
if _missing_credentials:
    raise RuntimeError(
        "Missing Reddit API credentials; set the environment variable(s): "
        + ", ".join(_missing_credentials)
    )

# The account name only appears in the user agent string; it can be overridden
# with REDDIT_USERNAME.
reddit_acct = os.environ.get("REDDIT_USERNAME", "Dangerous-Aerie9277")
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=f"Comment Extraction (by u/{reddit_acct})",
)

def get_posts():
    """Collect the comments of this month's top r/legaladvice submissions.

    Uses the module-level ``reddit`` client. A comment is kept only when both
    the comment and its submission still have an author.

    Returns:
        dict: column name -> list, with one entry per kept comment, under the
        keys ``post_id`` (submission id), ``utc`` (comment creation time),
        ``parent_author`` (author of the item replied to), ``post_title``,
        ``comment_author`` and ``comment`` (comment body).
    """
    sub = reddit.subreddit('legaladvice')
    sub_type = sub.top(time_filter="month")

    post = {'post_id':[], 'utc':[],'parent_author':[], 'post_title':[],'comment_author':[], 'comment':[]}

    #Get posts and their comments
    for submissions in sub_type:
        # Resolve "load more comments" placeholders before flattening the tree.
        submissions.comments.replace_more()
        for comment in submissions.comments.list():
            if comment.author is None or submissions.author is None:
                continue

            # Resolve the parent once; it was previously looked up twice per comment.
            parent = comment.parent()
            if parent.author is not None:
                parent_name = parent.author.name
            else:
                # Parent has no author (e.g. it was deleted): credit the
                # reply to the submission's author instead.
                parent_name = submissions.author.name

            # Store the submission id rather than the Submission object itself.
            post['post_id'].append(submissions.id)
            post['parent_author'].append(parent_name)
            post['post_title'].append(submissions.title)
            post['comment_author'].append(comment.author.name)
            post['comment'].append(comment.body)
            post['utc'].append(comment.created_utc)

    return post

In [None]:
# Turn the scraped columns into a DataFrame and discard rows with missing values.
top_df = pd.DataFrame.from_dict(get_posts()).dropna()

In [None]:
# Moderators or other users who don't count as actual posters are excluded,
# both as the target of a reply and as the author of a comment.
excluded_parents = ['parsnippity', 'AutoModerator']
excluded_commenters = ['AutoModerator', 'LocationBot']
df = top_df[
    ~top_df['parent_author'].isin(excluded_parents)
    & ~top_df['comment_author'].isin(excluded_commenters)
]

In [None]:
# Work on independent copies so the columns added below never touch `df`.
posts = df.copy(deep=True)
posts_segment = posts.copy(deep=True)

## Sentiment Analysis of Comments

In [None]:
sia = SentimentIntensityAnalyzer()

# Score every comment once and derive both columns from that single result
# (the analyzer was previously run twice per comment).
_polarity = posts_segment["comment"].map(sia.polarity_scores)

# Negative scores are stored with a minus sign so that the sign of a value
# tells which kind of sentiment it represents.
posts_segment["neg"] = _polarity.map(lambda scores: scores['neg']*(-1))
posts_segment["pos"] = _polarity.map(lambda scores: scores['pos'])
posts_segment.head()

In [None]:
# Average negative and positive score over all comments, in that order.
for score_column in ('neg', 'pos'):
    print(posts_segment[score_column].mean())

In [None]:
# Label each comment by whichever of its positive / negative scores has the
# larger magnitude: `sen_value` holds that (signed) score and `sen` the name of
# the column it came from. On a tie the first column, 'pos', is chosen.
v = posts_segment[['pos', 'neg']].to_numpy()
dominant_column = np.abs(v).argmax(axis=1)
posts_segment['sen_value'] = v[np.arange(len(v)), dominant_column]
posts_segment['sen'] = posts_segment[['pos', 'neg']].abs().idxmax(axis=1)

## Create Network of Posts
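Nodes are users; each directed edge is a comment, pointing from the comment's author to the author of the post or comment being replied to.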

In [None]:
import networkx as nx

# Fixed seed so the force-directed layout (and the figure) is reproducible.
LAYOUT_SEED = 42

# One directed edge per (commenter -> user replied to) pair, carrying the
# comment's sentiment value and label.
# NOTE: a DiGraph holds a single edge per ordered pair, so when one user
# replied to another several times only the last row's attributes are kept.
G = nx.from_pandas_edgelist(
    posts_segment,
    source='comment_author',
    target='parent_author',
    edge_attr=['sen_value', 'sen'],
    create_using=nx.DiGraph(),
)

# Colour edges by sentiment label: red for negative, green otherwise.
sen = nx.get_edge_attributes(G, 'sen')
nx.set_edge_attributes(
    G,
    {edge: ("red" if label == 'neg' else "green") for edge, label in sen.items()},
    name="color",
)

# Keep only the largest connected component (edge direction ignored).
Gcc = sorted(nx.connected_components(G.to_undirected()), key=len, reverse=True)
G0 = G.subgraph(Gcc[0])

pos = nx.spring_layout(G0, seed=LAYOUT_SEED)

edges = G0.edges()

#Edges colored based on if sentiment of comment is positive or negative
color_list = [attrs["color"] for i,j,attrs in G0.edges(data=True)]

In [None]:
# draw_networkx_edges uses node_size to decide where each arrow ends, so it is
# given the same size as the nodes that are actually drawn; otherwise it
# assumes the (much larger) default and the arrows stop short of the nodes.
NODE_SIZE = 10

nx.draw_networkx_edges(G0, pos, edge_color=color_list, node_size=NODE_SIZE)
nx.draw_networkx_nodes(G0, pos, alpha=0.3, node_size=NODE_SIZE)
plt.show()

## Community Detection

In [None]:
# Girvan-Newman yields successively finer partitions; only the first one
# produced by the generator is used here.
communities_generator = nx.community.girvan_newman(G0)
top_level_communities = [*next(communities_generator)]

In [None]:
#for each community, get average ratio of positive to negative posts
import statistics

# Sentiment labels grouped by comment author, built once instead of
# re-filtering the whole frame for every user.
_labels_by_author = {}
for _author, _label in zip(posts_segment['comment_author'], posts_segment['sen']):
    _labels_by_author.setdefault(_author, []).append(_label)

comm_ratios = []
# One result list per detected community (not hard-coded to two).
total_lists = [[] for _ in top_level_communities]
for i, community in enumerate(top_level_communities):
    # Counters start at 1 so the ratio is always defined.
    # NOTE(review): the counters are not reset per user, so each appended value
    # is the community's running pos/neg ratio up to that user, not that user's
    # own ratio. Confirm this is intended before relying on the t-test below.
    num_pos = 1
    num_neg = 1
    num_comments = 0
    for j in community:
        sen_list = _labels_by_author.get(j, [])
        num_comments += len(sen_list)
        positives = sum(1 for s in sen_list if s == "pos")
        num_pos += positives
        num_neg += len(sen_list) - positives
        total_lists[i].append(round(num_pos/num_neg,5))
    print(num_comments)
    comm_ratios.append(statistics.mean(total_lists[i]))

In [None]:
# Ratio values collected for the first two communities.
comm1_values, comm2_values = total_lists[0], total_lists[1]

from statistics import mean

# Degrees of freedom reported here: the smaller sample size minus one.
dof = min(map(len, (comm1_values, comm2_values))) - 1

# Difference between the two community means.
print(mean(comm1_values) - mean(comm2_values))

# Two-sample t-test without assuming equal variances (Welch), via SciPy.
t_stat, p_val = stats.ttest_ind(comm1_values, comm2_values, equal_var=False)
print(dof)
print(f"t-statistic = {t_stat!s}")
print(f"p-value = {p_val!s}")

alpha = 0.05
verdict = (
    "Reject the null hypothesis; there is a significant difference between the sentiment number ratios of community 1 and community 2."
    if p_val < alpha
    else "Fail to reject the null hypothesis; there is no significant difference between the sentiment number ratios of community 1 and community 2."
)
print(verdict)

In [None]:
#for each community, get averages sentiment values
from statistics import mean

# Sentiment values grouped by comment author, built once instead of
# re-filtering the whole frame for every user.
_values_by_author = {}
for _author, _value in zip(posts_segment['comment_author'], posts_segment['sen_value']):
    _values_by_author.setdefault(_author, []).append(_value)

comm_value_avgs = []
# One result list per detected community (not hard-coded to two).
total_value_lists = [[] for _ in top_level_communities]
#for each community, get average sentiment value
for i, community in enumerate(top_level_communities):
    total_value = 0
    num_comments = 0  # counts users in this community who wrote at least one comment
    #for each user, get their average sentiment score
    for j in community:
        sen_value_list = _values_by_author.get(j, [])
        if len(sen_value_list) == 0:
            # User only appears as a reply target. Skip them entirely; falling
            # through would re-add the previous user's mean (or raise a
            # NameError when this is the first user in the community).
            continue
        mean_value = mean(sen_value_list)
        num_comments += 1
        total_value += mean_value
        total_value_lists[i].append(mean_value)
    # A community with no commenting users has no defined average.
    comm_value_avgs.append(total_value/num_comments if num_comments else float("nan"))

In [None]:
# Per-user mean sentiment values for the first two communities.
comm1_values, comm2_values = total_value_lists[0], total_value_lists[1]

# Degrees of freedom reported here: the smaller sample size minus one.
dof = min(map(len, (comm1_values, comm2_values))) - 1

# Difference between the two community means.
print(mean(comm1_values) - mean(comm2_values))

# Two-sample t-test without assuming equal variances (Welch), via SciPy.
t_stat, p_val = stats.ttest_ind(comm1_values, comm2_values, equal_var=False)
print(dof)
print(f"t-statistic = {t_stat!s}")
print(f"p-value = {p_val!s}")

alpha = 0.05
verdict = (
    "Reject the null hypothesis; there is a significant difference between the sentiment values of community 1 and community 2."
    if p_val < alpha
    else "Fail to reject the null hypothesis; there is no significant difference between the sentiment values of community 1 and community 2."
)
print(verdict)

## Time Series Visualizations

In [None]:
# Convert each epoch timestamp once and derive the calendar fields from that
# result (the conversion was previously repeated for every column).
# NOTE(review): datetime.fromtimestamp without a tz argument returns the
# kernel's local time, so these columns depend on the machine's timezone.
# Confirm whether UTC was intended.
_comment_times = posts_segment['utc'].map(datetime.fromtimestamp)

posts_segment['date_time'] = _comment_times
posts_segment['day'] = _comment_times.map(lambda moment: moment.day)
# isocalendar()[1] is the ISO week number.
posts_segment['week'] = _comment_times.map(lambda moment: moment.isocalendar()[1])
posts_segment.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of comment times.
ax = sns.histplot(posts_segment["date_time"], bins=30)

# Weekly ticks derived from the data itself. The posts come from a live
# "top of the past month" query, so fixed calendar dates would fall outside the
# data on any later run and stretch the axis.
_first_day = posts_segment["date_time"].min().normalize()
_last_day = posts_segment["date_time"].max()
ax.set_xticks(list(pd.date_range(_first_day, _last_day, freq="7D").to_pydatetime()))

# Rotate after the ticks are set so the rotation applies to the final labels.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of comments per week, split by each comment's sentiment label.
sns.countplot(x='week', hue='sen', data=posts_segment)