In [1]:
import pandas as pd
from jupyter_dash import JupyterDash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px  

from dash import dash_table

# Create the JupyterDash application object; the layout and the figure
# callback defined in later cells are attached to this instance.
app = JupyterDash(__name__)


## Interactive Scatter Plot and Bubble Chart for Community Interaction Analysis

Scatter Plot: Visualize the relationship between the number of comments (num_comments) and the post score (score) to see whether engagement correlates with the community's reception of a post. Each point represents a post, colored by its subreddit.

Bubble Chart: Extend the scatter plot by incorporating the upvote_ratio as the size of each bubble. This would give insight into not only the engagement and reception but also how divisive or universally liked the posts are within each subreddit.

### Get Datasets Ready

In [2]:
def _read_sampled_posts(csv_path):
    """Load one sampled-posts CSV.

    The ``created_utc`` column is parsed as datetimes and used as the
    index of the returned DataFrame.
    """
    return pd.read_csv(csv_path, parse_dates=['created_utc'], index_col='created_utc')


# One frame per sampled subreddit file; every file is read with identical options.
sampled_ptsd_df = _read_sampled_posts('../Engagement Analysis Datasets/sampled_ptsd.csv')
sampled_adhd_df = _read_sampled_posts('../Engagement Analysis Datasets/sampled_adhd.csv')
sampled_apg_df = _read_sampled_posts('../Engagement Analysis Datasets/sampled_apg.csv')
sampled_dps_df = _read_sampled_posts('../Engagement Analysis Datasets/sampled_dps.csv')
sampled_ocd_df = _read_sampled_posts('../Engagement Analysis Datasets/sampled_ocd.csv')

In [3]:
# Display label -> sampled DataFrame, in the order the summaries are printed.
datasets = dict(
    PTSD=sampled_ptsd_df,
    ADHD=sampled_adhd_df,
    OCD=sampled_ocd_df,
    Depression=sampled_dps_df,
    Aspergers=sampled_apg_df,
)

# Print descriptive statistics of 'num_comments' for each subreddit; the
# 75% row of each summary is what the filtering step below uses as a cut-off.
for label, frame in datasets.items():
    comment_stats = frame['num_comments'].describe()
    print(f"Statistics for {label} subreddit:")
    print(comment_stats, '\n')


Statistics for PTSD subreddit:
count    10000.000000
mean         7.203900
std         13.116849
min          0.000000
25%          1.000000
50%          3.000000
75%          7.000000
max        245.000000
Name: num_comments, dtype: float64 

Statistics for ADHD subreddit:
count    10000.000000
mean        14.051400
std         64.373999
min          0.000000
25%          2.000000
50%          4.000000
75%          7.000000
max       1909.000000
Name: num_comments, dtype: float64 

Statistics for OCD subreddit:
count    10000.000000
mean         4.984800
std         12.424162
min          0.000000
25%          1.000000
50%          2.000000
75%          5.000000
max        466.000000
Name: num_comments, dtype: float64 

Statistics for Depression subreddit:
count    10000.000000
mean         3.099700
std          9.834634
min          0.000000
25%          0.000000
50%          1.000000
75%          3.000000
max        362.000000
Name: num_comments, dtype: float64 

Statistics for Aspe

Filter each subreddit to the posts whose comment count exceeds that subreddit's 75th-percentile value of num_comments:

PTSD: More than 7 comments

ADHD: More than 7 comments

OCD: More than 5 comments

Depression: More than 3 comments

Aspergers: More than 16 comments


In [4]:
## Filtering

def _more_comments_than(frame, cutoff):
    """Return the rows of ``frame`` whose 'num_comments' is strictly greater than ``cutoff``."""
    return frame[frame['num_comments'].gt(cutoff)]


# Cut-offs are the per-subreddit 75% values of 'num_comments' listed in the
# notes above this cell (PTSD 7, ADHD 7, OCD 5, Depression 3, Aspergers 16).
filtered_ptsd_df = _more_comments_than(sampled_ptsd_df, 7)
filtered_adhd_df = _more_comments_than(sampled_adhd_df, 7)
filtered_ocd_df = _more_comments_than(sampled_ocd_df, 5)
filtered_dps_df = _more_comments_than(sampled_dps_df, 3)
filtered_apg_df = _more_comments_than(sampled_apg_df, 16)

In [5]:
# Stack the five filtered frames row-wise into a single DataFrame that the
# dashboard reads from. Each frame keeps its own index values.
frames_to_stack = [
    filtered_ptsd_df,
    filtered_adhd_df,
    filtered_ocd_df,
    filtered_dps_df,
    filtered_apg_df,
]
combined_filtered_df = pd.concat(frames_to_stack)


In [6]:
# Distinct values of the 'subreddit' column, computed once and reused for
# both the dropdown options and its default selection.
subreddit_choices = combined_filtered_df['subreddit'].unique()

# Multi-select dropdown; every subreddit is selected by default.
subreddit_picker = dcc.Dropdown(
    id='subreddit-dropdown',
    options=[{'label': name, 'value': name} for name in subreddit_choices],
    value=subreddit_choices.tolist(),
    multi=True,
)

# Graph whose 'figure' property is filled in by the update_figure callback.
scatter_panel = dcc.Graph(id='interaction-scatter-plot')

app.layout = html.Div([subreddit_picker, scatter_panel])


@app.callback(
    Output('interaction-scatter-plot', 'figure'),
    [Input('subreddit-dropdown', 'value')]
)
def update_figure(selected_subreddits):
    """Rebuild the bubble chart for the subreddits chosen in the dropdown.

    Parameters
    ----------
    selected_subreddits : list of str, str, or None
        Current value of the 'subreddit-dropdown' component. ``None``
        is treated as "nothing selected" and a bare string as a single
        selection.

    Returns
    -------
    plotly figure
        Scatter of 'num_comments' (x) against 'score' (y), with marker size
        taken from 'upvote_ratio', color from 'subreddit', and the post
        'title' as hover label. Only rows of ``combined_filtered_df`` whose
        'subreddit' is in the selection are plotted.
    """
    # Series.isin() only accepts list-like input and raises TypeError for
    # None or a plain string, so normalise the dropdown value first.
    if selected_subreddits is None:
        selected_subreddits = []
    elif isinstance(selected_subreddits, str):
        selected_subreddits = [selected_subreddits]

    filtered_df = combined_filtered_df[combined_filtered_df['subreddit'].isin(selected_subreddits)]

    fig = px.scatter(
        filtered_df,
        x='num_comments',
        y='score',
        size='upvote_ratio',
        color='subreddit',
        hover_name='title',
        title='Community Interaction Analysis'
    )

    # Animate changes between successive figures over 500 ms.
    fig.update_layout(transition_duration=500)
    return fig


if __name__ == '__main__':
    # Start the Dash server on port 8051 with debug enabled; mode='inline'
    # asks JupyterDash to show the app in the notebook output.
    launch_options = {'mode': 'inline', 'debug': True, 'port': 8051}
    app.run_server(**launch_options)
