# Imports & Constants

In [1]:
# Imports
import json
import os
import pickle
from collections import defaultdict, deque
from dataclasses import asdict
from math import log, sqrt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pandas as pd
import altair as alt
import networkx as nx
from pyvis.network import Network
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from dtos.comment import Comment, json_to_comment
from dtos.post import Post
from dtos.submission import Submission, json_to_submission


# Turn off Altair's row-count guard so the full data frames built below can be charted.
alt.data_transformers.disable_max_rows()

# Download the NLTK data packages. The stop-word list is read by the content-analysis
# cell; punkt / punkt_tab are presumably the tokenizer models word_tokenize needs.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /Users/siddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/siddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/siddy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Constants
# --- Raw input -----------------------------------------------------------------
DATA_DIR = 'data/'
assert os.path.isdir(DATA_DIR)

SUBMISSIONS_FILE = f'{DATA_DIR}InvestmentClub_submissions.json'
COMMENTS_FILE = f'{DATA_DIR}InvestmentClub_comments.json'

# --- Output directories (every path keeps its trailing slash: later cells
# --- concatenate file names directly onto these strings) ------------------------
SCHEMAS_DIR = 'schemas/'
OBJECTS_DIR = 'objects/'
GRAPHS_DIR = 'graphs/'
CHARTS_DIR = 'charts/'
for _output_dir in (SCHEMAS_DIR, OBJECTS_DIR, GRAPHS_DIR, CHARTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

SUBMISSIONS_SCHEMA_FILE = f'{SCHEMAS_DIR}submissions.json'
COMMENTS_SCHEMA_FILE = f'{SCHEMAS_DIR}comments.json'

SUBMISSIONS_OBJECTS_FILE = f'{OBJECTS_DIR}submissions.pkl'
COMMENTS_OBJECTS_FILE = f'{OBJECTS_DIR}comments.pkl'
POSTS_OBJECTS_FILE = f'{OBJECTS_DIR}posts.pkl'

# Parent ids starting with this prefix refer to a submission rather than a comment.
SUBMISSION_ID_PREFIX = 't3_'

# --- Graph visualisation -------------------------------------------------------
INTERACTIONS_THRESHOLD = 5  # minimum reply count for an author-to-author edge to be drawn
POSTS_THRESHOLD = 1         # number of top-ranked posts drawn in the discussion graph

# --- Network analysis ----------------------------------------------------------
NUMBER_OF_REMOVAL_STEPS = 10     # the removal loop runs NUMBER_OF_REMOVAL_STEPS + 1 times
Z_SCORE_MIN_MESSAGES = 1         # users with fewer messages are left out of the z-score chart
VIRALITY_MIN_CASCADE_SIZE = 10   # only posts with strictly more comments than this are charted

# --- Temporal trends / content analysis ----------------------------------------
DAYS_OF_THE_WEEK = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
TOP_N_USERS = 15   # authors shown in the cumulative-submissions chart
NUM_CLUSTERS = 20  # KMeans clusters over submission titles
TOP_N_TERMS = 5    # terms used to label each cluster

# Data Preparation

In [3]:
# Create Schema Objects
def parse_object(parent: dict, obj: dict) -> None:
    """Merge the structure of one JSON record into an accumulated schema.

    ``parent`` is mutated in place. For every key of ``obj`` it receives:

    * a nested schema dict when the value is a dict,
    * ``{'type': 'list', 'items': ...}`` when the value is a list, where
      ``items`` is ``[]`` for an empty list, a merged nested schema for a list
      of dicts, or the type name of the first element otherwise,
    * ``{'type': <type name>}`` for scalars (``{'type': None}`` for nulls).

    The first concrete description of a key wins. The only entries that a later
    record may replace are the two "unknown" placeholders: ``{'type': None}``
    (first seen as null) and ``{'type': 'list', 'items': []}`` (first seen as
    an empty list). Without this, a key that happens to be null or empty in the
    first record would stay untyped for the whole data set.

    Args:
        parent: Schema dict to update.
        obj: A decoded JSON object whose keys and value types are recorded.
    """
    for key, value in obj.items():
        existing = parent.get(key)
        if existing:
            if value is None:
                # A null never adds information to an existing entry.
                continue
            # Nested schemas only ever hold dict values, so these comparisons
            # can match nothing but the leaf placeholders themselves.
            is_null_placeholder = existing == {'type': None}
            refines_empty_list = (
                existing == {'type': 'list', 'items': []}
                and isinstance(value, list)
                and bool(value)
            )
            if not (is_null_placeholder or refines_empty_list):
                continue

        if isinstance(value, dict):
            parent[key] = {}
            parse_object(parent[key], value)
        elif isinstance(value, list):
            if not value:
                parent[key] = {'type': 'list', 'items': []}
            elif isinstance(value[0], dict):
                parent[key] = {'type': 'list', 'items': {}}
                for val in value:
                    parse_object(parent[key]['items'], val)
            else:
                parent[key] = {'type': 'list', 'items': type(value[0]).__name__}
        else:
            parent[key] = {
                'type': type(value).__name__ if value is not None else None,
            }

def _build_schema(source_path: str, schema_path: str) -> dict:
    """Infer a schema from a JSON array of records and write it to disk.

    Args:
        source_path: JSON file holding a list of objects.
        schema_path: Destination file for the inferred schema (JSON).

    Returns:
        The schema dict accumulated by ``parse_object`` over every record.
    """
    with open(source_path, 'r') as infile:
        records = json.load(infile)

    schema = {}
    for record in records:
        parse_object(schema, record)

    # The input handle is closed before the output is opened, and the two
    # handles use different names so neither shadows the other.
    with open(schema_path, 'w') as outfile:
        json.dump(schema, outfile)
    return schema


SUBMISSIONS_SCHEMA = _build_schema(SUBMISSIONS_FILE, SUBMISSIONS_SCHEMA_FILE)
COMMENTS_SCHEMA = _build_schema(COMMENTS_FILE, COMMENTS_SCHEMA_FILE)

In [4]:
# Create and Save Objects
# Convert each raw JSON record into its DTO, then cache the resulting list as a
# pickle so later cells can reload it without re-parsing the JSON.
with open(SUBMISSIONS_FILE, 'r') as infile:
    submissions = [json_to_submission(raw_submission) for raw_submission in json.load(infile)]
with open(SUBMISSIONS_OBJECTS_FILE, 'wb') as outfile:
    pickle.dump(submissions, outfile)

with open(COMMENTS_FILE, 'r') as infile:
    comments = [json_to_comment(raw_comment) for raw_comment in json.load(infile)]
with open(COMMENTS_OBJECTS_FILE, 'wb') as outfile:
    pickle.dump(comments, outfile)

In [5]:
# Index submissions by id; a later duplicate id overwrites an earlier one.
submission_id_map = {}
for submission in submissions:
    submission_id_map[submission.id_] = submission

# Group comments by the submission they belong to: {submission_id: {comment_id: comment}}.
submission_comments_map = defaultdict(dict)
for comment in comments:
    comments_of_submission = submission_comments_map[comment.submission_id]
    comments_of_submission[comment.id_] = comment

# One Post per submission. Indexing the defaultdict yields an empty dict for
# submissions that received no comments.
posts = []
for submission_id, submission in submission_id_map.items():
    posts.append(Post.construct_post(submission, submission_comments_map[submission_id]))

with open(POSTS_OBJECTS_FILE, 'wb') as outfile:
    pickle.dump(posts, outfile)

# Exploratory Data Analysis

In [6]:
# Analyse Submissions
with open(SUBMISSIONS_OBJECTS_FILE, 'rb') as infile:
    data = map(asdict, pickle.load(infile))
    submissions = pd.DataFrame.from_records(data)
    submissions['created_utc'] = pd.to_datetime(pd.to_numeric(submissions['created_utc']), unit='s')
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
submissions.info()
print()
print(submissions.describe())


# Number of submissions per author, most active first, ignoring deleted accounts.
submissions_by_user = submissions['author'].value_counts().reset_index()
submissions_by_user.columns = ['author', 'submissions']
# .copy() detaches the filtered frame from its parent so that adding the 'bin'
# column below cannot trigger SettingWithCopyWarning.
submissions_by_user = submissions_by_user[submissions_by_user.author != '[deleted]'].copy()
print(submissions_by_user.head())
print()

# Left-closed bins (right=False): '1' is [1, 2), '10-50' is [10, 50), and so on.
bins = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 200, 500, 1000, 2000, float('inf')]
labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10-50', '50-100', '100-200', '200-500', '500-1000', '1000-2000', '2000+']
submissions_by_user['bin'] = pd.cut(submissions_by_user['submissions'], bins=bins, labels=labels, right=False)
bin_counts = submissions_by_user['bin'].value_counts().sort_index()
print(len(submissions_by_user))
print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18971 entries, 0 to 18970
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id_           18971 non-null  object        
 1   subreddit     18971 non-null  object        
 2   title         18971 non-null  object        
 3   selftext      18971 non-null  object        
 4   url           18940 non-null  object        
 5   author        18971 non-null  object        
 6   ups           18971 non-null  int64         
 7   score         18971 non-null  int64         
 8   permalink     18971 non-null  object        
 9   created_utc   18971 non-null  datetime64[ns]
 10  num_comments  18971 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(7)
memory usage: 1.6+ MB
None

                ups         score                    created_utc  num_comments
count  18971.000000  18971.000000                          18971  18971.000000
mean    

In [7]:
# Analyse Comments
with open(COMMENTS_OBJECTS_FILE, 'rb') as infile:
    data = map(asdict, pickle.load(infile))
    comments = pd.DataFrame.from_records(data)
    comments['created_utc'] = pd.to_datetime(pd.to_numeric(comments['created_utc']), unit='s')
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
comments.info()
print()
print(comments.describe())

# Number of comments per author, most active first, ignoring deleted accounts.
comments_by_user = comments['author'].value_counts().reset_index()
comments_by_user.columns = ['author', 'comments']
comments_by_user = comments_by_user[comments_by_user.author != '[deleted]']
print(comments_by_user.head())
print(len(comments_by_user))
print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22863 entries, 0 to 22862
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id_            22863 non-null  object        
 1   parent_id      22863 non-null  object        
 2   submission_id  22863 non-null  object        
 3   body           22863 non-null  object        
 4   author         22863 non-null  object        
 5   created_utc    22863 non-null  datetime64[ns]
 6   ups            22863 non-null  int64         
 7   score          22863 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 1.4+ MB
None

                         created_utc           ups         score
count                          22863  22863.000000  22863.000000
mean   2019-01-27 08:28:54.383239168      1.891834      1.824345
min              2012-02-01 18:13:16    -15.000000    -35.000000
25%       2017-04-12 19:13:22.500000      2.000

In [8]:
# Analyse Posts
with open(POSTS_OBJECTS_FILE, 'rb') as infile:
    posts = pickle.load(infile)

# Per-post comment statistics, keyed by submission id.
post_map = {}
for post in posts:
    n_comments, max_depth = post.comment_stats()
    post_map[post.submission.id_] = dict(num_comments=n_comments, max_depth=max_depth)

# From here on `posts` is a DataFrame with one row per submission.
posts = (
    pd.DataFrame.from_dict(post_map, orient='index')
    .reset_index()
    .rename(columns={'index': 'submission_id'})
)
print(posts.describe())

# Five posts with the most comments.
top_commented_posts = posts.sort_values('num_comments', ascending=False).head(5)
print(top_commented_posts)

# Five posts with the deepest reply chains.
top_depth_posts = posts.sort_values('max_depth', ascending=False).head(5)
print(top_depth_posts)

       num_comments     max_depth
count  18971.000000  18971.000000
mean       1.205050      0.256866
std        3.628944      0.959136
min        0.000000      0.000000
25%        0.000000      0.000000
50%        0.000000      0.000000
75%        1.000000      0.000000
max       80.000000     17.000000
      submission_id  num_comments  max_depth
30         t3_pl0ko            80         14
16183     t3_pud5i4            80         16
1084      t3_2q583b            65         14
9219      t3_cpgbd6            63         10
13913     t3_lbvbr7            60          2
      submission_id  num_comments  max_depth
6152      t3_7yfyxj            27         17
13982     t3_lh0d9n            34         16
16183     t3_pud5i4            80         16
15243     t3_nhmfe0            37         15
16433     t3_qj7gpp            22         14


# Graph Visualisation

In [9]:
# Reload the pickled DTO lists: the exploratory cells above rebound
# `submissions`, `comments` and `posts` to DataFrames.
with open(SUBMISSIONS_OBJECTS_FILE, 'rb') as infile:
    submissions = pickle.load(infile)
with open(COMMENTS_OBJECTS_FILE, 'rb') as infile:
    comments = pickle.load(infile)
with open(POSTS_OBJECTS_FILE, 'rb') as infile:
    posts = pickle.load(infile)

# id -> author lookups, used to resolve whom a comment replies to.
submission_author_map = dict((submission.id_, submission.author) for submission in submissions)
comment_author_map = dict((comment.id_, comment.author) for comment in comments)

## Author Interaction Graph
* **Nodes**: Authors (both submission and comment authors).
* **Edges**: A directed edge from author A to author B represents author A commenting on author B’s post or comment.
* **Edge Weight**: The number of interactions between the two authors (i.e., how many times A has replied to B).

In [10]:
graph1 = nx.DiGraph()
graph1.add_nodes_from(submission_author_map.values())
graph1.add_nodes_from(comment_author_map.values())

# Count replies per ordered (replying author, parent author) pair.
interactions_counts = defaultdict(int)
for comment in comments:
    parent_id = comment.parent_id
    # Ids carrying the submission prefix are resolved against submissions,
    # everything else against comments.
    if parent_id.startswith(SUBMISSION_ID_PREFIX):
        parent_authors = submission_author_map
    else:
        parent_authors = comment_author_map
    if parent_id not in parent_authors:
        # The parent is missing from the data set; skip this reply.
        continue
    interactions_counts[(comment.author, parent_authors[parent_id])] += 1

# Keep only edges with at least INTERACTIONS_THRESHOLD replies.
for (u, v), weight in interactions_counts.items():
    if weight >= INTERACTIONS_THRESHOLD:
        graph1.add_edge(u, v, weight=weight)
# remove_nodes_from silently ignores absent nodes, whereas remove_node raises
# NetworkXError when the data set happens to contain no '[deleted]' author.
graph1.remove_nodes_from(['[deleted]'])
graph1.remove_nodes_from(list(nx.isolates(graph1)))

net = Network(notebook=True,
              directed=True,
              height='750px',
              width='100%',
              cdn_resources='in_line',
              select_menu=True,
              heading=f'Author Network (Edge Weight Threshold: {INTERACTIONS_THRESHOLD})')
net.force_atlas_2based()
net.show_buttons(filter_=['nodes', 'edges', 'physics'])
# Node size scales with total degree; the tooltip shows in- and out-degree.
for node in graph1.nodes():
    net.add_node(node, label=node, value=graph1.degree(node) * 20,
                 title=f'In-Degree: {graph1.in_degree(node)}\nOut-Degree: {graph1.out_degree(node)}')
# `edge_attrs` rather than `data`, so the global `data` used by other cells is not clobbered.
for u, v, edge_attrs in graph1.edges(data=True):
    net.add_edge(u, v, title=edge_attrs['weight'], value=edge_attrs['weight'])
net.show(f'{GRAPHS_DIR}author_interaction_graph.html')

graphs/author_interaction_graph.html


## Post Discussion Graph

* **Nodes**: Submission/Comment IDs.
* **Edges**: Represent replies within the discussion tree. A submission connects to its top-level comments. A comment connects to its direct replies, forming a hierarchical thread structure.
* **Node Size**: Represents the number of child replies.

In [11]:
graph2 = nx.DiGraph()


def _post_rank(post):
    """Sort key for posts: comment count first, then negated maximum depth.

    Used with ``reverse=True``, so posts with more comments come first and,
    among equal counts, the shallower thread comes first. ``comment_stats()``
    is called once per post instead of twice.
    """
    n_comments, max_depth = post.comment_stats()
    return n_comments, -max_depth


posts.sort(key=_post_rank, reverse=True)
for idx, post in enumerate(posts[:POSTS_THRESHOLD], start=1):
    n_comments, _ = post.comment_stats()
    # Root node: the submission itself, sized by its comment count.
    graph2.add_node(post.submission.id_,
                    label=post.submission.author,
                    size=n_comments,
                    group=idx,
                    title=post.submission.title,
                    level=0)
    # Breadth-first walk over the comment tree, starting at post.comments.
    queue = deque(post.comments)
    while queue:
        node = queue.popleft()
        graph2.add_node(node.comment.id_,
                        label=node.comment.author,
                        # Size grows with the number of direct replies, with a floor of 10.
                        size=max(len(node.children) * 20, 10),
                        group=idx,
                        title=node.comment.body,
                        level=node.depth + 1)
        graph2.add_edge(node.comment.parent_id, node.comment.id_)
        queue.extend(node.children)

net = Network(notebook=True,
              directed=True,
              height='750px',
              width='100%',
              cdn_resources='in_line',
              heading=f'Posts Comment Network (Number of Posts: {POSTS_THRESHOLD})')
net.from_nx(graph2)
net.force_atlas_2based()
net.show_buttons(filter_=['nodes', 'edges', 'physics'])
net.show(f'{GRAPHS_DIR}posts_comments_graph.html')

graphs/posts_comments_graph.html


# Network Analysis

In [12]:
# Start the network analysis from freshly loaded DTO lists, independent of
# whatever the visualisation cells did to `submissions`, `comments` and `posts`.
with open(SUBMISSIONS_OBJECTS_FILE, 'rb') as infile:
    submissions = pickle.load(infile)
with open(COMMENTS_OBJECTS_FILE, 'rb') as infile:
    comments = pickle.load(infile)
with open(POSTS_OBJECTS_FILE, 'rb') as infile:
    posts = pickle.load(infile)

# id -> author lookups for submissions and comments.
submission_author_map = dict((submission.id_, submission.author) for submission in submissions)
comment_author_map = dict((comment.id_, comment.author) for comment in comments)

## Metric 1: Superuser Influence – Connected Components & Rich Club Coefficient

In [13]:
# Undirected author graph: an edge joins two authors if either replied to the other.
graph = nx.Graph()
graph.add_nodes_from(submission_author_map.values())
graph.add_nodes_from(comment_author_map.values())

for comment in comments:
    parent_id = comment.parent_id
    if parent_id.startswith(SUBMISSION_ID_PREFIX):
        parent_authors = submission_author_map
    else:
        parent_authors = comment_author_map
    # Replies whose parent is missing from the data set are skipped.
    if parent_id in parent_authors:
        graph.add_edge(comment.author, parent_authors[parent_id])

# remove_nodes_from ignores absent nodes; remove_node would raise if no
# '[deleted]' author exists.
graph.remove_nodes_from(['[deleted]'])
graph.remove_edges_from(list(nx.selfloop_edges(graph)))
graph.remove_nodes_from(list(nx.isolates(graph)))

# Rank users once, on the full graph, by degree centrality (highest first).
degrees = nx.degree_centrality(graph)
sorted_users = sorted(degrees, key=degrees.get, reverse=True)

removed_users = []
rich_club_coefficients = []
n_connected_components = []
size_largest_cc = []
for user in sorted_users:
    if len(removed_users) > NUMBER_OF_REMOVAL_STEPS:
        # NUMBER_OF_REMOVAL_STEPS + 1 measurements have been recorded.
        break
    if user not in graph:
        # This user was already pruned as an isolate after an earlier removal;
        # removing it again would raise NetworkXError. Move on to the next one.
        continue
    # Each row describes the graph while `user` is still present ("With User").
    rich_club_coefficients.append(nx.rich_club_coefficient(graph, normalized=False))
    connected_components = list(nx.connected_components(graph))
    largest_component = max(connected_components, key=len)
    n_connected_components.append(len(connected_components))
    size_largest_cc.append(len(largest_component))
    removed_users.append(user)
    graph.remove_node(user)
    graph.remove_nodes_from(list(nx.isolates(graph)))

metrics_df = pd.DataFrame({
    'user': removed_users,
    'size_largest_cc': size_largest_cc,
    'n_connected_components': n_connected_components,
    'rich_club_coefficients': rich_club_coefficients
})
# Long format: one row per (user, degree) pair for the rich-club line chart.
rich_club_long = [
    {'user': user, 'degree': degree, 'coefficient': value}
    for user, coeffs in zip(removed_users, rich_club_coefficients)
    for degree, value in coeffs.items()
]
rich_club_df = pd.DataFrame(rich_club_long)

# Two point selections: clicking a bar selects by x value, clicking a legend
# entry selects by user. Marks outside the combined selection are dimmed.
bar_selection = alt.selection_point(encodings=['x'])
legend_selection = alt.selection_point(fields=['user'], bind='legend')
combined_selection = bar_selection & legend_selection

# Left panel: number of components (bars) and size of the largest component
# (red line), each on its own y axis.
base = alt.Chart(metrics_df).encode(x=alt.X('user:N', title='With User', sort='y'))
bar = (
    base.mark_bar(color='steelblue')
    .encode(
        y=alt.Y('n_connected_components:Q', title='Number of Connected Components'),
        tooltip=['user', 'size_largest_cc', 'n_connected_components'],
        opacity=alt.condition(combined_selection, alt.value(1), alt.value(0.3)),
    )
    .add_params(bar_selection)
)
line = base.mark_line(color='red').encode(
    y=alt.Y('size_largest_cc:Q', title='Size of Largest Component', axis=alt.Axis(titleColor='red')),
)
chart = (
    alt.layer(bar, line)
    .resolve_scale(y='independent')
    .properties(title='Impact of Removing Users on Network Structure', width=500)
    .add_params(legend_selection)
)

# Right panel: rich-club coefficient per node degree, one line per user.
line_chart = (
    alt.Chart(rich_club_df)
    .mark_line(point=True)
    .encode(
        x=alt.X('degree:Q', title='Node Degree'),
        y=alt.Y('coefficient:Q', title='Rich Club Coefficient'),
        color=alt.Color('user:N', title='User'),
        tooltip=['user', 'degree', 'coefficient'],
        opacity=alt.condition(combined_selection, alt.value(1), alt.value(0.3)),
    )
    .add_params(legend_selection)
    .properties(title='Rich Club Coefficient Across Node Degrees', width=500)
)

figure = chart | line_chart
figure.save(f'{CHARTS_DIR}Superuser-Influence.html')
figure.show()

## Metric 2: Questioner vs. Answerer Z-Score

In [14]:
# Per user: 'A' counts replies the user wrote, 'Q' counts replies the user received.
reply_counts = defaultdict(lambda: {'Q': 0, 'A': 0})
for comment in comments:
    parent_id = comment.parent_id
    if parent_id.startswith(SUBMISSION_ID_PREFIX):
        parent_authors = submission_author_map
    else:
        parent_authors = comment_author_map
    try:
        q = parent_authors[parent_id]
    except KeyError:
        # Parent missing from the data set: the reply is not counted on either side.
        continue
    reply_counts[comment.author]['A'] += 1
    reply_counts[q]['Q'] += 1

# Every counted reply adds exactly one 'A' and one 'Q', so both shares come out equal.
total_questions = sum(counts['Q'] for counts in reply_counts.values())
total_answers = sum(counts['A'] for counts in reply_counts.values())
total_replies = total_questions + total_answers

P_q = total_questions / total_replies
P_a = total_answers / total_replies
print(f'P_q = {P_q}\tP_a = {P_a}')

# z = 2 * (answers - questions) / sqrt(answers + questions), per user.
z_scores = {
    user: (2 * (counts['A'] - counts['Q'])) / (sqrt(counts['A'] + counts['Q']))
    for user, counts in reply_counts.items()
}
z_scores.pop('[deleted]', None)


def _user_type(score):
    """Label a user by the sign of their z-score."""
    if score > 0:
        return 'Answerer'
    if score == 0:
        return 'Equal'
    return 'Questioner'


z_score_rows = []
for user, score in z_scores.items():
    counts = reply_counts[user]
    z_score_rows.append({
        'user': user,
        'questions': counts['Q'],
        'answers': counts['A'],
        'num_messages': counts['A'] + counts['Q'],
        'z_score': score,
        'user_type': _user_type(score),
    })
metrics_df = pd.DataFrame(z_score_rows)

# Keep only users with at least Z_SCORE_MIN_MESSAGES messages.
metrics_df = metrics_df.loc[metrics_df['num_messages'] >= Z_SCORE_MIN_MESSAGES]

# Scatter of z-score against message count (log x axis) with a fitted regression line.
scatter = (
    alt.Chart(metrics_df)
    .mark_circle(size=30, opacity=0.5)
    .encode(
        x=alt.X('num_messages:Q', title='Number of Messages a User Posted', scale=alt.Scale(type='log')),
        y=alt.Y('z_score:Q', title='Modified Z-Score'),
        tooltip=['user', 'questions', 'answers', 'z_score'],
    )
    .properties(width=600, height=400)
)
regression = scatter.transform_regression('num_messages', 'z_score').mark_line(color='blue')

# Density of message counts, stacked above the scatter plot.
hist = (
    alt.Chart(metrics_df)
    .transform_density(density='num_messages', as_=['num_messages', 'density'])
    .mark_area(opacity=0.3)
    .encode(
        x=alt.X('num_messages:Q', title=None, scale=alt.Scale(type='log')),
        y=alt.Y('density:Q', title='No. Users', axis=alt.Axis(labels=False)),
    )
    .properties(width=600, height=400)
)

figure = alt.vconcat(hist, scatter + regression, spacing=5).interactive()
figure.save(f'{CHARTS_DIR}Z-Score.html')
figure.show()

P_q = 0.5	P_a = 0.5


## Metric 3: Virality Score (Cascade Size & Lifespan)

In [15]:
# Virality of a post p: v_p = ln(1 + c_p) * ln(1 + l_p), where c_p is the number
# of comments (cascade size) and l_p is the time between the submission and its
# latest comment (lifespan).
submission_links = []
cascade_sizes = []
lifespans = []
virality_scores = []
for post in posts:
    t_0 = post.submission.created_utc
    # Named `post_comments`, not `comments`: rebinding the global `comments`
    # list here would break re-running the earlier network-analysis cells.
    post_comments = post.get_all_comments()
    c_p = len(post_comments)
    # Latest activity; never earlier than the submission itself.
    t_max = max([t_0, *(reply.created_utc for reply in post_comments)])

    l_p = t_max - t_0  # presumably seconds (the axis below is labelled so) - confirm against the DTO
    v_p = log(1 + c_p) * log(1 + l_p)
    if v_p != 0:
        # NOTE(review): assumes Submission exposes a `link` attribute usable as a URL - confirm.
        submission_links.append(post.submission.link)
        cascade_sizes.append(c_p)
        lifespans.append(l_p)
        virality_scores.append(v_p)
metrics_df = pd.DataFrame({
    'submission': submission_links,
    'cascade_size': cascade_sizes,
    'lifespan': lifespans,
    'virality_score': virality_scores
})

# Strictly greater than the threshold, as in the original analysis.
metrics_df = metrics_df[metrics_df['cascade_size'] > VIRALITY_MIN_CASCADE_SIZE]
selection = alt.selection_interval()
scatter1 = alt.Chart(metrics_df).mark_circle().encode(
    x=alt.X('cascade_size:Q', scale=alt.Scale(type='log'), title='Cascade Size'),
    y=alt.Y('virality_score:Q', scale=alt.Scale(type='log'), title='Virality Score'),
    color=alt.Color('virality_score:Q', scale=alt.Scale(scheme='viridis')),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    tooltip=['submission', 'cascade_size', 'virality_score'],
    href='submission:N'
).properties(
    width=500, height=400, title='Cascade Size vs. Virality Score'
).add_params(selection)
scatter2 = scatter1.encode(
    x=alt.X('lifespan:Q', scale=alt.Scale(type='log'), title='Lifespan (Seconds)'),
    tooltip=['submission', 'lifespan', 'virality_score'],
).properties(title='Lifespan vs. Virality Score')
# .properties() returns a new chart; the original discarded that result, so the
# overall title was never shown or saved.
figure = (scatter1 | scatter2).properties(title='Virality Analysis (Ctrl+Click to Open Post)')
figure.save(f'{CHARTS_DIR}Virality-Score.html')
figure.show()

# Additional Research Questions

## Temporal Trends

### Distribution of Submissions over Time (Years & Months)

In [16]:
# Rebuild the submissions DataFrame from the pickled DTOs and derive year / month columns.
with open(SUBMISSIONS_OBJECTS_FILE, 'rb') as infile:
    data = map(asdict, pickle.load(infile))
    submissions = pd.DataFrame.from_records(data)
    submissions['created_utc'] = pd.to_datetime(pd.to_numeric(submissions['created_utc']), unit='s')

created_at = submissions['created_utc'].dt
submissions = submissions.assign(
    created_utc_year=created_at.year,
    created_utc_month=created_at.month
)

# Clicking a year bar highlights that year's column in the heatmap.
year_select = alt.selection_point(fields=['created_utc_year'])

# Bar chart: submissions per year.
submissions_over_time = submissions.groupby('created_utc_year').size().reset_index(name='count')
chart = (
    alt.Chart(submissions_over_time)
    .mark_bar()
    .encode(
        alt.X('created_utc_year:O', title='Year'),
        alt.Y('count:Q', title='Number of Submissions'),
        alt.Tooltip(['created_utc_year:O', 'count:Q'])
    )
    .properties(title='Submissions Over Time by Year', width=800, height=400)
    .add_params(year_select)
)

# Heatmap: submissions per (year, month).
submissions_by_year_month = (
    submissions
    .groupby(['created_utc_year', 'created_utc_month'])
    .size()
    .reset_index(name='count')
)
heatmap = (
    alt.Chart(submissions_by_year_month)
    .mark_rect()
    .encode(
        alt.X('created_utc_year:O', title='Year'),
        alt.Y('created_utc_month:O', title='Month'),
        alt.Color('count:Q', scale=alt.Scale(scheme='viridis'), title='Number of Submissions'),
        alt.Tooltip(['created_utc_year:O', 'created_utc_month:O', 'count:Q']),
        opacity=alt.condition(year_select, alt.value(1), alt.value(0.2))
    )
    .properties(title='Submissions Over Time (Year and Month)', width=800, height=400)
)

figure = heatmap & chart
figure.save(f'{CHARTS_DIR}Submissions-Year-Month.html')
figure.show()

### Distribution of Submissions over Time (Day & Hour)

In [17]:
# Derive hour-of-day and weekday-name columns from the submission timestamp.
submissions = submissions.assign(
    created_utc_hour=submissions['created_utc'].dt.hour,
    created_utc_day=submissions['created_utc'].dt.day_name()
)

# Per (hour, weekday) cell: number of submissions and mean score.
day_hour_counts = submissions.groupby(['created_utc_hour', 'created_utc_day']).size().reset_index(name='post_count')
day_avg_scores = submissions.groupby(['created_utc_hour', 'created_utc_day'])['score'].mean().reset_index(name='avg_score')

# Click a cell to highlight the same (day, hour) in both heatmaps. The selection
# is not bound to a legend: Vega-Lite only supports legend bindings for
# single-field selections, and the colour legends here are continuous gradients.
point_select = alt.selection_point(fields=['created_utc_day', 'created_utc_hour'])
scatter_plot1 = alt.Chart(day_hour_counts).mark_rect().encode(
    x=alt.X('created_utc_day:N', title='Day of Week', sort=DAYS_OF_THE_WEEK),
    y=alt.Y('created_utc_hour:O', title='Hour of Day'),
    color=alt.Color('post_count:Q', scale=alt.Scale(scheme='viridis')),
    tooltip=['created_utc_day', 'created_utc_hour', 'post_count'],
    opacity=alt.condition(point_select, alt.value(1), alt.value(0.2))
).properties(
    title='Submissions Over Time (Day & Hour)',
    width=400,
    height=600
).add_params(
    point_select
)
scatter_plot2 = alt.Chart(day_avg_scores).mark_rect().encode(
    x=alt.X('created_utc_day:O', title='Day of Week', scale=alt.Scale(domain=DAYS_OF_THE_WEEK)),
    y=alt.Y('created_utc_hour:O', title='Hour of Day'),
    color=alt.Color('avg_score:Q', scale=alt.Scale(scheme='viridis')),
    tooltip=['created_utc_day', 'created_utc_hour', 'avg_score'],
    opacity=alt.condition(point_select, alt.value(1), alt.value(0.2))
).properties(
    # The plotted value is the mean score per cell, so the title says "Average", not "Total".
    title='Average Score by Day of Week and Hour of Day',
    width=400,
    height=600
).add_params(
    point_select
)

# Independent colour scales: counts and average scores live on different ranges.
figure = alt.hconcat(scatter_plot1, scatter_plot2).resolve_scale(color='independent')
figure.save(f'{CHARTS_DIR}Submissions-Day-Hour.html')
figure.show()

### Distribution of Posts by Top Users

In [18]:
# The TOP_N_USERS most prolific authors, ignoring deleted accounts.
named_submissions = submissions[submissions['author'] != '[deleted]']
top_users = named_submissions['author'].value_counts().nlargest(TOP_N_USERS).index

# Their submissions in chronological order, with a running per-author count.
is_top_author = submissions['author'].isin(top_users)
submissions_top = submissions[is_top_author].sort_values(by=['created_utc'])
submissions_top['cumulative_count'] = submissions_top.groupby('author').cumcount() + 1
author_counts = submissions[is_top_author]['author'].value_counts()
submissions_top['author_count'] = submissions_top['author'].map(author_counts)

# Legend order: most submissions first.
legend_order = author_counts.sort_values(ascending=False).index.tolist()

selection = alt.selection_point(fields=['author'], bind='legend')
figure = (
    alt.Chart(submissions_top)
    .mark_line()
    .encode(
        x=alt.X('created_utc:T', title='Time'),
        y=alt.Y('cumulative_count:Q', title='Cumulative Number of Posts', scale=alt.Scale(type='log')),
        color=alt.Color('author:N', title='User', sort=legend_order),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        tooltip=alt.Tooltip(['created_utc:T', 'cumulative_count:Q', 'author:N'])
    )
    .properties(
        title=f'Cumulative Number of Submissions Over Time for Top {TOP_N_USERS} Users',
        width=800,
        height=400
    )
    .add_params(selection)
)
figure.save(f'{CHARTS_DIR}Submissions-By-Users.html')
figure.show()

## Content Analysis

In [19]:
# Reload the submission DTOs: the temporal-trend cells rebound `submissions` to a DataFrame.
# NOTE: pickle.load can execute arbitrary code; only load pickles this notebook wrote itself.
with open(SUBMISSIONS_OBJECTS_FILE, 'rb') as infile:
    submissions = pickle.load(infile)

In [20]:
submission_titles = [submission.title for submission in submissions]
STOP_WORDS = set(stopwords.words('english'))


def tokenize(text):
    """Normalise a title for vectorisation.

    Lowercases ``text``, tokenises it with NLTK's ``word_tokenize``, drops
    tokens that are not purely alphanumeric or that are English stop words,
    and returns the remaining tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token.isalnum() and token not in STOP_WORDS)


# TF-IDF over the cleaned titles, limited to the 1000 highest-ranked terms.
cleaned_texts = [tokenize(title) for title in submission_titles]
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(cleaned_texts)

kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42)
kmeans.fit(X)

# Label each cluster with the TOP_N_TERMS highest-weighted terms of its centroid.
terms = vectorizer.get_feature_names_out()
centroids = kmeans.cluster_centers_
top_terms_map = {}
for i in range(NUM_CLUSTERS):
    top_terms_indices = centroids[i].argsort()[-TOP_N_TERMS:][::-1]
    top_terms_map[i] = ', '.join(terms[idx] for idx in top_terms_indices)

# 2-D projection for plotting. random_state pins the result when PCA picks a
# randomized solver, matching the fixed seed already used for KMeans.
pca = PCA(n_components=2, random_state=42)
reduced_vectors = pca.fit_transform(X.toarray())

df = pd.DataFrame({
    'submission': submission_titles,
    'x': reduced_vectors[:, 0],
    'y': reduced_vectors[:, 1],
    'cluster': kmeans.labels_,
})
df['top_terms'] = df['cluster'].apply(lambda c: top_terms_map[c])

# Clicking a legend entry shows only that cluster; unselected points become fully transparent.
cluster_selection = alt.selection_point(fields=['top_terms'], bind='legend')
figure = (
    alt.Chart(df)
    .mark_circle(size=60)
    .encode(
        x=alt.X('x:Q'),
        y=alt.Y('y:Q'),
        color=alt.Color('top_terms:N', title='Clusters'),
        tooltip=['submission', 'top_terms'],
        opacity=alt.condition(cluster_selection, alt.value(1), alt.value(0))
    )
    .add_params(cluster_selection)
    .properties(title='KMeans Topic Clusters', width=800, height=400)
    .interactive()
)
figure.save(f'{CHARTS_DIR}KMeans-Topic-Clusters.html')
figure.show()